author    Bruno Cardoso Lopes <bruno.cardoso@gmail.com>  2011-08-23 22:06:37 +0000
committer Bruno Cardoso Lopes <bruno.cardoso@gmail.com>  2011-08-23 22:06:37 +0000
commit    d8b7dd52525e99b998544bbeecb56907587b25a9 (patch)
tree      9aa9593244020ada937b01ceac13e5a19a868132
parent    4477d691ed9fa63f051986dcb98e358054c8281f (diff)
Fix a nasty bug where a v4i64 splat was being wrongly emitted with 32-bit
permutations. Also tidy up some patterns and move them closer to their
instruction definitions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138392 91177308-0d34-0410-b5e6-96231b3b80d8
 lib/Target/X86/X86ISelLowering.cpp |  36
 lib/Target/X86/X86InstrSSE.td      | 125
 test/CodeGen/X86/avx-splat.ll      |   6
 3 files changed, 109 insertions(+), 58 deletions(-)
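For context on the bug: VPERMILPS permutes 32-bit elements only within each
128-bit lane, so a single 32-bit permutation cannot splat a 64-bit element of
a v4i64 across both lanes. A minimal scalar sketch of the assumed in-lane
semantics (illustrative emulation, not LLVM code):

#include <array>
#include <cstdint>
#include <cstdio>

// Scalar emulation of VPERMILPS-style semantics: each 128-bit lane of a
// 256-bit vector is permuted independently, so a 32-bit element can never
// cross the lane boundary.
std::array<uint32_t, 8> permilps(const std::array<uint32_t, 8> &v, int imm) {
  std::array<uint32_t, 8> r;
  for (int i = 0; i < 8; ++i) {
    int lane = i & ~3; // 0 for the low lane, 4 for the high lane
    r[i] = v[lane + ((imm >> (2 * (i & 3))) & 3)];
  }
  return r;
}

int main() {
  // A v4i64 viewed as eight 32-bit words; element 1 of the v4i64 is words 2..3.
  std::array<uint32_t, 8> v = {0, 1, 2, 3, 4, 5, 6, 7};
  // imm = 0xEE selects words <2,3,2,3> within each lane: the high lane yields
  // <6,7,6,7>, not a true splat of the 64-bit element <2,3>.
  auto r = permilps(v, 0xEE);
  for (uint32_t w : r) std::printf("%u ", w); // prints: 2 3 2 3 6 7 6 7
  std::printf("\n");
  return 0;
}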
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d2b7690489..88566c687e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -4188,20 +4188,21 @@ static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
assert((VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
&& "Vector size not supported");
- bool Is128 = VT.getSizeInBits() == 128;
- EVT NVT = Is128 ? MVT::v4f32 : MVT::v8f32;
- V = DAG.getNode(ISD::BITCAST, dl, NVT, V);
-
- if (Is128) {
+ if (VT.getSizeInBits() == 128) {
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
- V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+ V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
+ &SplatMask[0]);
} else {
- // The second half of indicies refer to the higher part, which is a
- // duplication of the lower one. This makes this shuffle a perfect match
- // for the VPERM instruction.
+ // To use VPERMILPS to splat scalars, the second half of indices must
+ // refer to the higher part, which is a duplication of the lower one,
+ // because VPERMILPS can only handle in-lane permutations.
int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
- V = DAG.getVectorShuffle(NVT, dl, V, DAG.getUNDEF(NVT), &SplatMask[0]);
+
+ V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
+ V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
+ &SplatMask[0]);
}
return DAG.getNode(ISD::BITCAST, dl, VT, V);
@@ -4217,6 +4218,9 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
int NumElems = SrcVT.getVectorNumElements();
unsigned Size = SrcVT.getSizeInBits();
+ assert(((Size == 128 && NumElems > 4) || Size == 256) &&
+ "Unknown how to promote splat for type");
+
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
if (Size == 256) {
@@ -4229,16 +4233,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// All i16 and i8 vector types can't be used directly by a generic shuffle
// instruction because the target has no such instruction. Generate shuffles
// which repeat i16 and i8 several times until they fit in i32, and then can
- // be manipulated by target suported shuffles. After the insertion of the
- // necessary shuffles, the result is bitcasted back to v4f32 or v8f32.
+ // be manipulated by target supported shuffles.
EVT EltVT = SrcVT.getVectorElementType();
- if (NumElems > 4 && (EltVT == MVT::i8 || EltVT == MVT::i16))
+ if (EltVT == MVT::i8 || EltVT == MVT::i16)
V1 = PromoteSplati8i16(V1, DAG, EltNo);
// Recreate the 256-bit vector and place the same 128-bit vector
// into the low and high part. This is necessary because we want
- // to use VPERM to shuffle the v8f32 vector, and VPERM only shuffles
- // inside each separate v4f32 lane.
+ // to use VPERM* to shuffle the vectors.
if (Size == 256) {
SDValue InsV = Insert128BitVector(DAG.getUNDEF(SrcVT), V1,
DAG.getConstant(0, MVT::i32), DAG, dl);
@@ -6211,6 +6213,7 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
// Handle splat operations
if (SVOp->isSplat()) {
unsigned NumElem = VT.getVectorNumElements();
+ int Size = VT.getSizeInBits();
// Special case, this is the only place now where it's allowed to return
// a vector_shuffle operation without using a target specific node, because
// *hopefully* it will be optimized away by the dag combiner. FIXME: should
@@ -6223,7 +6226,8 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1);
// Handle splats by matching through known shuffle masks
- if (VT.is128BitVector() && NumElem <= 4)
+ if ((Size == 128 && NumElem <= 4) ||
+ (Size == 256 && NumElem < 8))
return SDValue();
// All remaining splats are promoted to target supported vector shuffles.
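The fix above first recreates the 256-bit vector with the same 128-bit half in
both lanes, and only then applies the in-lane mask {EltNo, EltNo, EltNo,
EltNo, EltNo+4, EltNo+4, EltNo+4, EltNo+4}. A rough scalar sketch of why that
mask then yields a full splat (illustrative helper, not the DAG code):

#include <array>
#include <cstdint>

// Sketch of the splat-mask trick from getLegalSplat: once the low 128-bit
// half is duplicated into the high half, the in-lane mask
// {e,e,e,e, e+4,e+4,e+4,e+4} produces a full 256-bit splat of element e.
std::array<uint32_t, 8> splatViaInLaneShuffle(std::array<uint32_t, 4> lo, int e) {
  // Step 1: place the same 128 bits in both halves of the 256-bit vector.
  std::array<uint32_t, 8> v;
  for (int i = 0; i < 4; ++i) { v[i] = lo[i]; v[i + 4] = lo[i]; }
  // Step 2: apply the in-lane splat mask; each lane reads only its own half,
  // and the high half reads v[e+4] == lo[e].
  int mask[8] = {e, e, e, e, e + 4, e + 4, e + 4, e + 4};
  std::array<uint32_t, 8> r;
  for (int i = 0; i < 8; ++i)
    r[i] = v[mask[i]];
  return r; // all eight 32-bit words equal lo[e]
}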
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index d1f74842c3..40dd294d74 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -473,13 +473,90 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
(v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
}
-def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
-let AddedComplexity = 20 in {
- def : Pat<(v4f32 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
- def : Pat<(v2i64 (movddup VR128:$src, (undef))),
- (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+let Predicates = [HasAVX] in {
+ // MOVHPS patterns
+ def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (VMOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // MOVLHPS patterns
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (VMOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+ def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (VMOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+ // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+ def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ }
+ def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+ def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+ // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+ def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (VMOVHLPSrr VR128:$src1, VR128:$src1)>;
+ }
+}
+
+let Predicates = [HasSSE1] in {
+ // MOVHPS patterns
+ def : Pat<(movlhps VR128:$src1, (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm (v4i32 VR128:$src1), addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+ def : Pat<(X86Movlhps VR128:$src1,
+ (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (MOVHPSrm VR128:$src1, addr:$src2)>;
+
+ // MOVLHPS patterns
+ let AddedComplexity = 20 in {
+ def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v4f32 VR128:$src), (v4f32 VR128:$src))>;
+ def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+ (MOVLHPSrr (v2i64 VR128:$src), (v2i64 VR128:$src))>;
+
+ // vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+ def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ }
+ def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+ def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
+ (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+
+ // MOVHLPS patterns
+ let AddedComplexity = 20 in {
+ // vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+ def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+ (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+ // vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+ def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+ def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+ (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+ }
}
//===----------------------------------------------------------------------===//
@@ -4011,22 +4088,6 @@ def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
Requires<[HasSSE2]>;
let AddedComplexity = 20 in {
-// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
-def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
-def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-
-// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
-def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
- (MOVHLPSrr VR128:$src1, VR128:$src1)>;
-}
-
-let AddedComplexity = 20 in {
// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
@@ -6023,20 +6084,6 @@ def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))),
def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)),
(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
-// Shuffle with MOVLHPS
-def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
- (MOVHPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
-
// FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the problem
// is during lowering, where it's not possible to recognize the load fold cause
// it has two uses through a bitcast. One use disappears at isel time and the
@@ -6108,8 +6155,8 @@ def : Pat<(X86Movlps VR128:$src1,
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
(MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2), sub_sd))>;
-def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
+def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
+ (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4i32 VR128:$src2), sub_sd))>;
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
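For reference, the shuffle masks named in the pattern comments above map onto
the two instructions as follows; a scalar sketch of the assumed
MOVLHPS/MOVHLPS semantics (illustrative functions, not part of the patch):

#include <array>

// movlhps: vector_shuffle v1, v2 <0, 1, 4, 5> -- low halves of both operands.
std::array<float, 4> movlhps(std::array<float, 4> a, std::array<float, 4> b) {
  return {a[0], a[1], b[0], b[1]};
}

// movhlps: vector_shuffle v1, v2 <6, 7, 2, 3> -- high half of the second
// operand into the low half, high half of the first operand preserved.
std::array<float, 4> movhlps(std::array<float, 4> a, std::array<float, 4> b) {
  return {b[2], b[3], a[2], a[3]};
}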
diff --git a/test/CodeGen/X86/avx-splat.ll b/test/CodeGen/X86/avx-splat.ll
index 38b2ff3b06..f8522c2695 100644
--- a/test/CodeGen/X86/avx-splat.ll
+++ b/test/CodeGen/X86/avx-splat.ll
@@ -21,8 +21,8 @@ entry:
}
; CHECK: vmovd
+; CHECK-NEXT: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
@@ -32,8 +32,8 @@ entry:
ret <4 x i64> %vecinit6.i
}
-; CHECK: vinsertf128 $1
-; CHECK-NEXT: vpermilps $0
+; CHECK: vshufpd $0
+; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
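The updated CHECK lines reflect the new lowering: duplicate the 64-bit scalar
inside the low 128 bits, then mirror that half into the upper lane, rather
than emitting a bogus 32-bit vpermilps. A sketch of the equivalent sequences
using AVX intrinsics (function names are illustrative, not taken from the
test; instruction choices follow the CHECK lines):

#include <immintrin.h>

// Splat an i64 across a 256-bit vector, mirroring funcC's expected
// sequence: vmovd + vmovlhps + vinsertf128 $1.
static __m256i splat_i64(long long q) {
  __m128i lo = _mm_cvtsi64_si128(q);                    // vmovd in the test
  __m128 dup = _mm_movelh_ps(_mm_castsi128_ps(lo),
                             _mm_castsi128_ps(lo));     // vmovlhps
  __m256 v = _mm256_castps128_ps256(dup);
  v = _mm256_insertf128_ps(v, dup, 1);                  // vinsertf128 $1
  return _mm256_castps_si256(v);
}

// Splat a double, matching funcD's expected vshufpd $0 + vinsertf128 $1.
static __m256d splat_f64(double q) {
  __m128d lo = _mm_set_sd(q);
  __m128d dup = _mm_shuffle_pd(lo, lo, 0);              // vshufpd $0
  __m256d v = _mm256_castpd128_pd256(dup);
  return _mm256_insertf128_pd(v, dup, 1);               // vinsertf128 $1
}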