X86 CodeGenPrep: sink shufflevectors before shifts

On x86, shifting a vector by a scalar is significantly cheaper than shifting a vector by another fully general vector. Unfortunately, because SelectionDAG operates on just one basic block at a time, the shufflevector instruction that reveals whether the right-hand side of a shift *is* really a scalar is often not visible to CodeGen when it's needed. This adds another handler to CodeGenPrepare, to sink any useful shufflevector instructions down to the basic block where they're used, predicated on a target hook (since on other architectures, doing so will often just introduce extra real work). rdar://problem/16063505 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201655 91177308-0d34-0410-b5e6-96231b3b80d8
author: Tim Northover <tnorthover@apple.com> 2014-02-19 10:02:43 +0000
committer: Tim Northover <tnorthover@apple.com> 2014-02-19 10:02:43 +0000
commit: 44697f3fc1c81644aedadf5e879fed7ff56a03da (patch)
tree: 63d1b192dfd091c63c6d8f9babd5b34873ab367d
parent: 84d5a235fdd09b7ed5271203a0b4ac9ab657edab (diff)
download: llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.gz
llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.bz2
llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.xz
5 files changed, 203 insertions, 0 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 155bb0a7a5..0b3428ed7a 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1185,6 +1185,14 @@ public:
     return true;
   }
 
+  /// Return true if shifting a vector of type \p Ty by a uniform scalar amount
+  /// is significantly cheaper than shifting by an amount which varies across
+  /// lanes. On x86, for example, there is a "psllw" instruction for the former
+  /// case, but no simple instruction for a general "a << b" on vectors.
+  virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
+    return false;
+  }
+
   /// Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index b1d734e932..f038580c5f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -14172,6 +14172,24 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   return true;
 }
 
+bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
+  switch (Ty->getScalarSizeInBits()) {
+  case 8:
+    // 8-bit shifts are always expensive, but versions with a scalar amount
+    // aren't particularly cheaper than those without.
+    return false;
+  case 32:
+  case 64:
+    // AVX2 added vpsllv[dq] (and other per-lane shifts), which make variable
+    // shifts just as cheap as scalar ones; without AVX2 scalar still wins.
+    return !Subtarget->hasInt256();
+  default:
+    // Everything else is significantly cheaper to shift by a scalar amount
+    // than by a fully general vector.
+    return true;
+  }
+}
+
 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
   if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
     return false;
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 6ea060ba3b..ce9594ae3e 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -679,6 +679,9 @@ namespace llvm {
     /// the immediate into a register.
     virtual bool isLegalAddImmediate(int64_t Imm) const;
 
+    /// True unless lanes are 8-bit, or 32/64-bit with AVX2 variable shifts.
+    virtual bool isVectorShiftByScalarCheap(Type *Ty) const;
+
     /// isTruncateFree - Return true if it's free to truncate a value of
     /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate a i32 value in
     /// register EAX to i16 by referencing its sub-register AX.
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 0fde256943..3c9ecce8e3 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -132,6 +132,7 @@ typedef DenseMap<Instruction *, Type *> InstrToOrigTy;
     bool MoveExtToFormExtLoad(Instruction *I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
+    bool OptimizeShuffleVectorInst(ShuffleVectorInst *SVI);
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
   };
@@ -2719,6 +2720,74 @@ bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) {
   return true;
 }
 
+
+/// Return true if every non-undef lane of SVI's mask picks the same element.
+static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
+  SmallVector<int, 16> Mask(SVI->getShuffleMask());
+  int SplatElem = -1;
+  for (unsigned i = 0; i < Mask.size(); ++i) {
+    if (Mask[i] == -1) continue; // Undef lane: must not forget SplatElem.
+    if (SplatElem != -1 && Mask[i] != SplatElem) return false;
+    SplatElem = Mask[i];
+  }
+  return true;
+}
+
+/// Some targets have expensive vector shifts if the lanes aren't all the same
+/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
+/// it's often worth sinking a shufflevector splat down to its use so that
+/// codegen can spot all lanes are identical. Returns true if the IR changed.
+bool CodeGenPrepare::OptimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+  BasicBlock *DefBB = SVI->getParent();
+
+  // Only do this xform if variable vector shifts are particularly expensive.
+  if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+    return false;
+
+  // We only expect better codegen by sinking a shuffle if we can recognise a
+  // constant splat.
+  if (!isBroadcastShuffle(SVI))
+    return false;
+
+  // InsertedShuffles - Only insert a shuffle in each block once.
+  DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
+
+  bool MadeChange = false;
+  for (Value::use_iterator UI = SVI->use_begin(), E = SVI->use_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Rewriting a use unlinks it from SVI's use list, so advance first.
+    ++UI;
+    // Only shifts living outside the defining block benefit from a copy.
+    BasicBlock *UserBB = User->getParent();
+    if (UserBB == DefBB || !User->isShift()) continue;
+
+    // Everything checks out, sink the shuffle if the user's block doesn't
+    // already have a copy.
+    Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+    if (!InsertedShuffle) {
+      BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+      InsertedShuffle = new ShuffleVectorInst(SVI->getOperand(0),
+                                              SVI->getOperand(1),
+                                              SVI->getOperand(2), "", InsertPt);
+    }
+
+    // Redirect only this operand; any other use gets its own iteration.
+    TheUse = InsertedShuffle;
+    MadeChange = true;
+  }
+
+  // If we removed all uses, nuke the shuffle.
+  if (SVI->use_empty()) {
+    SVI->eraseFromParent();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
 bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
@@ -2791,6 +2860,9 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
 
+  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+    return OptimizeShuffleVectorInst(SVI);
+
   return false;
 }
 
diff --git a/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll
new file mode 100644
index 0000000000..c4ee79cd7f
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/x86-shuffle-sink.ll
@@ -0,0 +1,102 @@
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=core-avx2 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-AVX2
+; RUN: opt -S -codegenprepare -mtriple=x86_64-apple-macosx10.9 -mcpu=corei7 %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SSE2
+
+define <16 x i8> @test_8bit(<16 x i8> %lhs, <16 x i8> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_8bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+; 8-bit lanes: the x86 hook reports no benefit, so the splat is never sunk.
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <16 x i8> %lhs, %mask
+  %mask = shufflevector <16 x i8> %tmp, <16 x i8> undef, <16 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <16 x i8> %mask
+
+if_false:
+  %res = shl <16 x i8> %lhs, %mask
+  ret <16 x i8> %res
+}
+
+define <8 x i16> @test_16bit(<8 x i16> %lhs, <8 x i16> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_16bit
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+; 16-bit lanes: a copy of the splat must appear in if_false, feeding the shl.
+; CHECK: if_false:
+; CHECK: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK: shl <8 x i16> %lhs, [[SPLAT]]
+  %mask = shufflevector <8 x i16> %tmp, <8 x i16> undef, <8 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <8 x i16> %mask
+
+if_false:
+  %res = shl <8 x i16> %lhs, %mask
+  ret <8 x i16> %res
+}
+
+define <4 x i32> @test_notsplat(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-LABEL: @test_notsplat
+; CHECK: if_true:
+; CHECK-NOT: shufflevector
+; Mask <0,1,1,0> is not a broadcast, so the shuffle must stay where it is.
+; CHECK: if_false:
+; CHECK-NOT: shufflevector
+; CHECK: shl <4 x i32> %lhs, %mask
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = shl <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_32bit(<4 x i32> %lhs, <4 x i32> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_32bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: ashr <4 x i32> %lhs, %mask
+; With AVX2 nothing moves; without it the splat (undef lane and all) is sunk.
+; CHECK-SSE2-LABEL: @test_32bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: ashr <4 x i32> %lhs, [[SPLAT]]
+  %mask = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 0>
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <4 x i32> %mask
+
+if_false:
+  %res = ashr <4 x i32> %lhs, %mask
+  ret <4 x i32> %res
+}
+
+define <2 x i64> @test_64bit(<2 x i64> %lhs, <2 x i64> %tmp, i1 %tst) {
+; CHECK-AVX2-LABEL: @test_64bit
+; CHECK-AVX2: if_false:
+; CHECK-AVX2-NOT: shufflevector
+; CHECK-AVX2: lshr <2 x i64> %lhs, %mask
+; With AVX2 nothing moves; without it the splat is sunk next to the lshr.
+; CHECK-SSE2-LABEL: @test_64bit
+; CHECK-SSE2: if_false:
+; CHECK-SSE2: [[SPLAT:%[0-9a-zA-Z_]+]] = shufflevector
+; CHECK-SSE2: lshr <2 x i64> %lhs, [[SPLAT]]
+
+  %mask = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
+  br i1 %tst, label %if_true, label %if_false
+
+if_true:
+  ret <2 x i64> %mask
+
+if_false:
+  %res = lshr <2 x i64> %lhs, %mask
+  ret <2 x i64> %res
+}
author	Tim Northover <tnorthover@apple.com>	2014-02-19 10:02:43 +0000
committer	Tim Northover <tnorthover@apple.com>	2014-02-19 10:02:43 +0000
commit	44697f3fc1c81644aedadf5e879fed7ff56a03da (patch)
tree	63d1b192dfd091c63c6d8f9babd5b34873ab367d
parent	84d5a235fdd09b7ed5271203a0b4ac9ab657edab (diff)
download	llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.gz llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.bz2 llvm-44697f3fc1c81644aedadf5e879fed7ff56a03da.tar.xz