-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp             88
-rw-r--r--  test/CodeGen/X86/vector-shuffle-128-v8.ll       3
-rw-r--r--  test/CodeGen/X86/vector-shuffle-combining.ll   60
3 files changed, 138 insertions, 13 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ff206976bc..5f19d1b169 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19061,6 +19061,79 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
}
}
+/// \brief Search for a combinable shuffle across a chain ending in pshufd.
+///
+/// We walk up the chain and look for a combinable shuffle, skipping over
+/// shuffles that we could hoist this shuffle's transformation past without
+/// altering anything.
+static bool combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
+                                         SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI) {
+  assert(N.getOpcode() == X86ISD::PSHUFD &&
+         "Called with something other than an x86 128-bit dword shuffle!");
+  SDLoc DL(N);
+
+  // Walk up a single-use chain looking for a combinable shuffle.
+  SDValue V = N.getOperand(0);
+  for (; V.hasOneUse(); V = V.getOperand(0)) {
+    switch (V.getOpcode()) {
+    default:
+      return false; // Nothing combined!
+
+    case ISD::BITCAST:
+      // Skip bitcasts as we always know the type for the target specific
+      // instructions.
+      continue;
+
+    case X86ISD::PSHUFD:
+      // Found another dword shuffle.
+      break;
+
+    case X86ISD::PSHUFLW:
+      // Check that the low words (being shuffled) are the identity in the
+      // dword shuffle, and the high words are self-contained.
+      if (Mask[0] != 0 || Mask[1] != 1 ||
+          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
+        return false;
+
+      continue;
+
+    case X86ISD::PSHUFHW:
+      // Check that the high words (being shuffled) are the identity in the
+      // dword shuffle, and the low words are self-contained.
+      if (Mask[2] != 2 || Mask[3] != 3 ||
+          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
+        return false;
+
+      continue;
+    }
+    // Break out of the loop if we break out of the switch.
+    break;
+  }
+
+  if (!V.hasOneUse())
+    // We fell out of the loop without finding a viable combining instruction.
+    return false;
+
+  // Record the old value to use in RAUW-ing.
+  SDValue Old = V;
+
+  // Merge this node's mask and our incoming mask.
+  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+  for (int &M : Mask)
+    M = VMask[M];
+  V = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V.getOperand(0),
+                  getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+  // Replace N with its operand as we're going to combine that shuffle away.
+  DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+  // Replace the combinable shuffle with the combined one, updating all users
+  // so that we re-evaluate the chain here.
+  DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+  return true;
+}
+
/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
///
/// We walk up the chain, skipping shuffles of the other half and looking
@@ -19194,18 +19267,11 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V);
}
-    // Fallthrough
+    break;
+
case X86ISD::PSHUFD:
-    if (V.getOpcode() == N.getOpcode()) {
-      // If we have two sequential shuffles of the same kind we can always fold
-      // them. Even if there are multiple uses, this is beneficial because it
-      // breaks a dependency.
-      SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
-      for (int &M : Mask)
-        M = VMask[M];
-      return DAG.getNode(N.getOpcode(), DL, VT, V.getOperand(0),
-                         getV4X86ShuffleImm8ForMask(Mask, DAG));
-    }
+    if (combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+      return SDValue(); // We combined away this shuffle.
break;
}
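
The heart of the new combine is the mask composition loop (M = VMask[M]) together with the two hoisting checks for PSHUFLW/PSHUFHW. The standalone sketch below is not part of the patch; the helper names are invented for illustration, and it only replays the same arithmetic on plain arrays to show how two PSHUFD masks fuse and when a dword mask may hop over a word shuffle of one half.

#include <array>
#include <cassert>
#include <cstdio>

using DWordMask = std::array<int, 4>;

// A PSHUFLW only permutes words 0-3, so the outer dword shuffle can hop over
// it iff it leaves dwords 0-1 alone and keeps dwords 2-3 within {2,3}.
static bool canHoistPastPSHUFLW(const DWordMask &Mask) {
  return Mask[0] == 0 && Mask[1] == 1 &&
         Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4;
}

// Symmetric check for PSHUFHW: dwords 2-3 stay put, dwords 0-1 stay low.
static bool canHoistPastPSHUFHW(const DWordMask &Mask) {
  return Mask[2] == 2 && Mask[3] == 3 &&
         Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2;
}

// Compose two dword masks the way the patch does (M = VMask[M]): each entry
// of the outer (later) mask is looked up through the inner (earlier) mask.
static DWordMask composeMasks(DWordMask Outer, const DWordMask &Inner) {
  for (int &M : Outer)
    M = Inner[M];
  return Outer;
}

int main() {
  // Imm 27 == 0b00'01'10'11 is the full reversal [3,2,1,0]; composing it with
  // itself yields the identity, which is why combine_pshufd1 folds to a bare
  // ret in the new test below.
  DWordMask Reverse = {3, 2, 1, 0};
  DWordMask Composed = composeMasks(Reverse, Reverse);
  assert((Composed == DWordMask{0, 1, 2, 3}));

  // A mask that only rearranges the high dwords can be hoisted past a
  // PSHUFLW, but not past a PSHUFHW.
  DWordMask HighOnly = {0, 1, 3, 2};
  assert(canHoistPastPSHUFLW(HighOnly));
  assert(!canHoistPastPSHUFHW(HighOnly));

  std::printf("composed mask: %d %d %d %d\n", Composed[0], Composed[1],
              Composed[2], Composed[3]);
  return 0;
}
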
diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll
index 091822b6cb..5d1922a348 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -157,9 +157,8 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SSE2: # BB#0:
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7]
; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1]
+; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2]
; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7]
-; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5>
ret <8 x i16> %shuffle
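
The only change in this test is the trailing pshufd folding into the earlier one: its mask [0,1,3,2] leaves the low dwords in place, so it commutes with the intervening pshuflw (which touches only words 0-3) and composes with the pshufd mask [0,3,2,1]. A quick standalone sanity check of that composition (not part of the test, just the arithmetic behind the updated CHECK lines):

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> Earlier = {0, 3, 2, 1}; // pshufd that used to precede the pshuflw
  std::array<int, 4> Later = {0, 1, 3, 2};   // trailing pshufd that is now folded away
  std::array<int, 4> Fused;
  for (int I = 0; I < 4; ++I)
    Fused[I] = Earlier[Later[I]]; // lane I of the later shuffle reads Earlier[Later[I]]
  assert((Fused == std::array<int, 4>{0, 3, 1, 2})); // matches the new CHECK mask
  return 0;
}
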
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
index dae1ef5875..1bc2aee6ef 100644
--- a/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3,9 +3,69 @@
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
+declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+define <4 x i32> @combine_pshufd1(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27)
+ ret <4 x i32> %c
+}
+
+define <4 x i32> @combine_pshufd2(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd2
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd3(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd4(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd4
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31)
+ ret <4 x i32> %d
+}
+
+define <4 x i32> @combine_pshufd5(<4 x i32> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufd5
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76)
+ %b.cast = bitcast <4 x i32> %b to <8 x i16>
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27)
+ %c.cast = bitcast <8 x i16> %c to <4 x i32>
+ %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76)
+ ret <4 x i32> %d
+}
+
define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
; CHECK-SSE2-LABEL: @combine_pshuflw1
; CHECK-SSE2: # BB#0: