summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChandler Carruth <chandlerc@gmail.com>2014-06-27 11:34:40 +0000
committerChandler Carruth <chandlerc@gmail.com>2014-06-27 11:34:40 +0000
commit4363b0729b2b2aa50949ac9fb3af5ced0240898f (patch)
treea79307489e3443b447e9a302f14e4d4bbbeb17f4
parentf91161874e516621aa9c1734391c5bd01d9d4e6d (diff)
downloadllvm-4363b0729b2b2aa50949ac9fb3af5ced0240898f.tar.gz
llvm-4363b0729b2b2aa50949ac9fb3af5ced0240898f.tar.bz2
llvm-4363b0729b2b2aa50949ac9fb3af5ced0240898f.tar.xz
[x86] Teach the target-specific combining how to aggressively fold
half-shuffles, even looking through intervening instructions in a chain. Summary: This doesn't happen to show up with any test cases I've found for the current shuffle lowering, but previous attempts would benefit from this and it seems generally useful. I've tested it directly using intrinsics, which also shows that it will work with hand vectorized code as well. Note that even though pshufd isn't directly used in these tests, it gets exercised because we combine some of the half shuffles into a pshufd first, and then merge them. Differential Revision: http://reviews.llvm.org/D4291 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211890 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp90
-rw-r--r--test/CodeGen/X86/vector-shuffle-combining.ll49
2 files changed, 139 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2e23cd3674..ff206976bc 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -19061,6 +19061,88 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
}
}
+/// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
+///
+/// We walk up the chain, skipping shuffles of the other half and looking
+/// through shuffles which switch halves trying to find a shuffle of the same
+/// pair of dwords.
+static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ assert(
+ (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
+ "Called with something other than an x86 128-bit half shuffle!");
+ SDLoc DL(N);
+ unsigned CombineOpcode = N.getOpcode();
+
+ // Walk up a single-use chain looking for a combinable shuffle.
+ SDValue V = N.getOperand(0);
+ for (; V.hasOneUse(); V = V.getOperand(0)) {
+ switch (V.getOpcode()) {
+ default:
+ return false; // Nothing combined!
+
+ case ISD::BITCAST:
+ // Skip bitcasts as we always know the type for the target specific
+ // instructions.
+ continue;
+
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ if (V.getOpcode() == CombineOpcode)
+ break;
+
+ // Other-half shuffles are no-ops.
+ continue;
+
+ case X86ISD::PSHUFD: {
+ // We can only handle pshufd if the half we are combining either stays in
+ // its half, or switches to the other half. Bail if one of these isn't
+ // true.
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ int DOffset = CombineOpcode == X86ISD::PSHUFLW ? 0 : 2;
+ if (!((VMask[DOffset + 0] < 2 && VMask[DOffset + 1] < 2) ||
+ (VMask[DOffset + 0] >= 2 && VMask[DOffset + 1] >= 2)))
+ return false;
+
+ // Map the mask through the pshufd and keep walking up the chain.
+ for (int i = 0; i < 4; ++i)
+ Mask[i] = 2 * (VMask[DOffset + Mask[i] / 2] % 2) + Mask[i] % 2;
+
+ // Switch halves if the pshufd does.
+ CombineOpcode =
+ VMask[DOffset + Mask[0] / 2] < 2 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
+ continue;
+ }
+ }
+ // Break out of the loop if we break out of the switch.
+ break;
+ }
+
+ if (!V.hasOneUse())
+ // We fell out of the loop without finding a viable combining instruction.
+ return false;
+
+ // Record the old value to use in RAUW-ing.
+ SDValue Old = V;
+
+ // Merge this node's mask and our incoming mask (adjusted to account for all
+ // the pshufd instructions encountered).
+ SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
+ for (int &M : Mask)
+ M = VMask[M];
+ V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
+ getV4X86ShuffleImm8ForMask(Mask, DAG));
+
+ // Replace N with its operand as we're going to combine that shuffle away.
+ DAG.ReplaceAllUsesWith(N, N.getOperand(0));
+
+ // Replace the combinable shuffle with the combined one, updating all users
+ // so that we re-evaluate the chain here.
+ DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
+ return true;
+}
+
/// \brief Try to combine x86 target specific shuffles.
static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -19080,6 +19162,11 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
return SDValue();
}
+ // Nuke no-op shuffles that show up after combining.
+ if (isNoopShuffleMask(Mask))
+ return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
+
+ // Look for simplifications involving one or two shuffle instructions.
SDValue V = N.getOperand(0);
switch (N.getOpcode()) {
default:
@@ -19088,6 +19175,9 @@ static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
case X86ISD::PSHUFHW:
assert(VT == MVT::v8i16);
+ if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
+ return SDValue(); // We combined away this shuffle, so we're done.
+
// See if this reduces to a PSHUFD which is no more expensive and can
// combine with more operations.
if (Mask[0] % 2 == 0 && Mask[2] % 2 == 0 &&
diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
new file mode 100644
index 0000000000..dae1ef5875
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+
+define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ ret <8 x i16> %c
+}
+
+define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw2
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw3
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufhw1
+; CHECK-SSE2: # BB#0:
+; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT: retq
+ %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
+ %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+ %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
+ ret <8 x i16> %d
+}
+