summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2012-07-01 06:12:26 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2012-07-01 06:12:26 +0000
commit8f40f7b8676ae7931baaecb1046a21f09471384b (patch)
tree7e9c81e2818a4f270446083b06f705e087618b1d
parent51e89c0d6ab7ec7d6c816c845447b1c3267775a3 (diff)
downloadllvm-8f40f7b8676ae7931baaecb1046a21f09471384b.tar.gz
llvm-8f40f7b8676ae7931baaecb1046a21f09471384b.tar.bz2
llvm-8f40f7b8676ae7931baaecb1046a21f09471384b.tar.xz
Optimization of shuffle node that can fit to the register form of VBROADCAST instruction on AVX2.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@159504 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp12
-rw-r--r--lib/Target/X86/X86InstrSSE.td33
-rw-r--r--test/CodeGen/X86/avx2-vbroadcast.ll96
3 files changed, 135 insertions, 6 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index e6a0df7b04..ba6659357c 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5047,8 +5047,16 @@ X86TargetLowering::LowerVectorBroadcast(SDValue &Op, SelectionDAG &DAG) const {
SDValue Sc = Op.getOperand(0);
if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
- Sc.getOpcode() != ISD::BUILD_VECTOR)
- return SDValue();
+ Sc.getOpcode() != ISD::BUILD_VECTOR) {
+
+ if (!Subtarget->hasAVX2())
+ return SDValue();
+
+ // Use the register form of the broadcast instruction available on AVX2.
+ if (VT.is256BitVector())
+ Sc = Extract128BitVector(Sc, 0, DAG, dl);
+ return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
+ }
Ld = Sc.getOperand(0);
ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index ad8d15dab3..5319455dc5 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7272,8 +7272,8 @@ let ExeDomain = SSEPackedSingle in {
int_x86_avx2_vbroadcast_ss_ps_256>;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>;
+def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
+ int_x86_avx2_vbroadcast_sd_pd_256>;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
@@ -7684,6 +7684,31 @@ let Predicates = [HasAVX2] in {
def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
(VPBROADCASTQYrm addr:$src)>;
+ def : Pat<(v16i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBrr VR128:$src)>;
+ def : Pat<(v32i8 (X86VBroadcast (v16i8 VR128:$src))),
+ (VPBROADCASTBYrr VR128:$src)>;
+ def : Pat<(v8i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWrr VR128:$src)>;
+ def : Pat<(v16i16 (X86VBroadcast (v8i16 VR128:$src))),
+ (VPBROADCASTWYrr VR128:$src)>;
+ def : Pat<(v4i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDrr VR128:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (v4i32 VR128:$src))),
+ (VPBROADCASTDYrr VR128:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 VR128:$src))),
+ (VPBROADCASTQYrr VR128:$src)>;
+ def : Pat<(v4f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSrr VR128:$src)>;
+ def : Pat<(v8f32 (X86VBroadcast (v4f32 VR128:$src))),
+ (VBROADCASTSSYrr VR128:$src)>;
+ def : Pat<(v2f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VPBROADCASTQrr VR128:$src)>;
+ def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
+ (VBROADCASTSDYrr VR128:$src)>;
+
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {
@@ -7694,7 +7719,7 @@ let Predicates = [HasAVX2] in {
(VBROADCASTSSYrr
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FR32:$src, sub_ss))>;
def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
- (VBROADCASTSDrr
+ (VBROADCASTSDYrr
(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FR64:$src, sub_sd))>;
def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
@@ -7704,7 +7729,7 @@ let Predicates = [HasAVX2] in {
(VBROADCASTSSYrr
(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), GR32:$src, sub_ss))>;
def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
- (VBROADCASTSDrr
+ (VBROADCASTSDYrr
(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), GR64:$src, sub_sd))>;
}
}
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 46b41fa953..b804233663 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -259,3 +259,99 @@ define <4 x double> @_inreg3(double %scalar) nounwind uwtable readnone ssp {
ret <4 x double> %wide
}
+;CHECK: _inreg8xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <8 x float> @_inreg8xfloat(<8 x float> %a) {
+ %b = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> zeroinitializer
+ ret <8 x float> %b
+}
+
+;CHECK: _inreg4xfloat
+;CHECK: vbroadcastss
+;CHECK: ret
+define <4 x float> @_inreg4xfloat(<4 x float> %a) {
+ %b = shufflevector <4 x float> %a, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %b
+}
+
+;CHECK: _inreg16xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <16 x i16> @_inreg16xi16(<16 x i16> %a) {
+ %b = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> zeroinitializer
+ ret <16 x i16> %b
+}
+
+;CHECK: _inreg8xi16
+;CHECK: vpbroadcastw
+;CHECK: ret
+define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
+ %b = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %b
+}
+
+
+;CHECK: _inreg4xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
+ %b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
+ ret <4 x i64> %b
+}
+
+;CHECK: _inreg2xi64
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x i64> @_inreg2xi64(<2 x i64> %a) {
+ %b = shufflevector <2 x i64> %a, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %b
+}
+
+;CHECK: _inreg4xdouble
+;CHECK: vbroadcastsd
+;CHECK: ret
+define <4 x double> @_inreg4xdouble(<4 x double> %a) {
+ %b = shufflevector <4 x double> %a, <4 x double> undef, <4 x i32> zeroinitializer
+ ret <4 x double> %b
+}
+
+;CHECK: _inreg2xdouble
+;CHECK: vpbroadcastq
+;CHECK: ret
+define <2 x double> @_inreg2xdouble(<2 x double> %a) {
+ %b = shufflevector <2 x double> %a, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %b
+}
+
+;CHECK: _inreg8xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
+ %b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
+ ret <8 x i32> %b
+}
+
+;CHECK: _inreg4xi32
+;CHECK: vpbroadcastd
+;CHECK: ret
+define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
+ %b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %b
+}
+
+;CHECK: _inreg32xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <32 x i8> @_inreg32xi8(<32 x i8> %a) {
+ %b = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> zeroinitializer
+ ret <32 x i8> %b
+}
+
+;CHECK: _inreg16xi8
+;CHECK: vpbroadcastb
+;CHECK: ret
+define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
+ %b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %b
+}