author    Nadav Rotem <nadav.rotem@intel.com>    2011-05-11 08:12:09 +0000
committer Nadav Rotem <nadav.rotem@intel.com>    2011-05-11 08:12:09 +0000
commit    4301222525b565028850030835b8db9ce6d153db (patch)
tree      89660351f871398592d7ac95a4de33c635151da2
parent    41cdc16e7301c91d2460aa14412f592695b0d4ed (diff)
Add custom lowering of X86 vector SRA/SRL/SHL when the shift amount is a splat vector.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@131179 91177308-0d34-0410-b5e6-96231b3b80d8
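As a rough illustration of what the new lowering produces (not part of the original commit message): a vector shift whose amount is a constant splat can now select a single SSE2 immediate-shift instruction. In SSE2-intrinsic terms, the selected opcodes correspond to:

    #include <emmintrin.h>  // SSE2

    // Illustrative sketch only: each intrinsic below compiles to the same
    // immediate-shift instruction that the patch selects for the matching
    // splat-constant vector shift. SSE2 has no 64-bit arithmetic right
    // shift, which is why the patch adds no SRA case for v2i64.
    __m128i shl_v4i32(__m128i x) { return _mm_slli_epi32(x, 2); } // pslld $2
    __m128i srl_v4i32(__m128i x) { return _mm_srli_epi32(x, 2); } // psrld $2
    __m128i sra_v4i32(__m128i x) { return _mm_srai_epi32(x, 2); } // psrad $2
    __m128i shl_v2i64(__m128i x) { return _mm_slli_epi64(x, 2); } // psllq $2
    __m128i srl_v2i64(__m128i x) { return _mm_srli_epi64(x, 2); } // psrlq $2
    __m128i shl_v8i16(__m128i x) { return _mm_slli_epi16(x, 2); } // psllw $2
    __m128i srl_v8i16(__m128i x) { return _mm_srli_epi16(x, 2); } // psrlw $2
    __m128i sra_v8i16(__m128i x) { return _mm_srai_epi16(x, 2); } // psraw $2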
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp   87
-rw-r--r--  lib/Target/X86/X86ISelLowering.h      4
-rw-r--r--  test/CodeGen/X86/x86-shifts.ll      142
3 files changed, 222 insertions, 11 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 97fd2a3f90..cd939f472e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -927,7 +927,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// Can turn SHL into an integer multiply.
setOperationAction(ISD::SHL, MVT::v4i32, Custom);
setOperationAction(ISD::SHL, MVT::v16i8, Custom);
- setOperationAction(ISD::SRL, MVT::v4i32, Legal);
// i8 and i16 vectors are custom, because the source register and source
// memory operand types are not the same width. f32 vectors are
@@ -949,6 +948,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
}
+ if (Subtarget->hasSSE2()) {
+ setOperationAction(ISD::SRL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SRL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::SHL, MVT::v2i64, Custom);
+ setOperationAction(ISD::SHL, MVT::v4i32, Custom);
+ setOperationAction(ISD::SHL, MVT::v8i16, Custom);
+
+ setOperationAction(ISD::SRA, MVT::v4i32, Custom);
+ setOperationAction(ISD::SRA, MVT::v8i16, Custom);
+ }
+
if (Subtarget->hasSSE42())
setOperationAction(ISD::VSETCC, MVT::v2i64, Custom);
@@ -6616,9 +6628,9 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
}
-/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values and
/// take a 2 x i32 value to shift plus a shift amount.
-SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 3 && "Not a double-shift!");
EVT VT = Op.getValueType();
unsigned VTBits = VT.getSizeInBits();
@@ -8778,16 +8790,71 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const {
return Res;
}
-SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
+
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
+ SDValue Amt = Op.getOperand(1);
LLVMContext *Context = DAG.getContext();
- assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later");
+ // Must have SSE2.
+ if (!Subtarget->hasSSE2()) return SDValue();
+
+ // Optimize shl/srl/sra when the shift amount is a constant splat.
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ uint64_t ShiftAmt = C->getZExtValue();
+
+ if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SHL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v2i64 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRL)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v4i32 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+
+ if (VT == MVT::v8i16 && Op.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
+ R, DAG.getConstant(ShiftAmt, MVT::i32));
+ }
+ }
+
+ // Lower SHL with variable shift amount.
+ // Cannot lower SHL without SSE4.1 or later.
+ if (!Subtarget->hasSSE41()) return SDValue();
- if (VT == MVT::v4i32) {
+ if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
Op.getOperand(1), DAG.getConstant(23, MVT::i32));
@@ -8806,7 +8873,7 @@ SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const {
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
}
- if (VT == MVT::v16i8) {
+ if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
// a = a << 5;
Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
@@ -9111,7 +9178,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::SHL_PARTS:
case ISD::SRA_PARTS:
- case ISD::SRL_PARTS: return LowerShift(Op, DAG);
+ case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
@@ -9139,7 +9206,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::CTLZ: return LowerCTLZ(Op, DAG);
case ISD::CTTZ: return LowerCTTZ(Op, DAG);
case ISD::MUL: return LowerMUL_V2I64(Op, DAG);
- case ISD::SHL: return LowerSHL(Op, DAG);
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: return LowerShift(Op, DAG);
case ISD::SADDO:
case ISD::UADDO:
case ISD::SSUBO:
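Aside on the variable-amount v4i32 SHL path kept above: it builds 2^Amt per lane through the IEEE-754 exponent field. Shifting the amount left by 23 places it in the exponent, adding 0x3f800000 (the encoding of 1.0f) applies the bias, and converting the resulting float back to an integer yields 1 << Amt, so the shift becomes a vector multiply. A standalone C++ sketch of the trick, assuming amounts below 32 (illustrative, not from the patch):

    #include <cstdint>
    #include <cstring>
    #include <cstdio>

    // For 0 <= amt < 32, (amt << 23) + 0x3f800000 is the IEEE-754 single-
    // precision bit pattern of the float 2^amt; converting that float back
    // to an integer recovers 1 << amt.
    static uint32_t pow2_via_float(uint32_t amt) {
      uint32_t bits = (amt << 23) + 0x3f800000u; // amt lands in the exponent
      float f;
      std::memcpy(&f, &bits, sizeof f);          // bitcast i32 -> f32
      return (uint32_t)f;                        // fp-to-int gives 2^amt
    }

    int main() {
      for (uint32_t amt = 0; amt < 8; ++amt)
        std::printf("1 << %u == %u\n", amt, pow2_via_float(amt));
      return 0;
    }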
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index ea0c1b68d7..ca84a99045 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -770,7 +770,7 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
SelectionDAG &DAG) const;
SDValue LowerBITCAST(SDValue op, SelectionDAG &DAG) const;
@@ -805,7 +805,7 @@ namespace llvm {
SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
new file mode 100644
index 0000000000..fdf68f92a9
--- /dev/null
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -0,0 +1,142 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Splat patterns below
+
+
+define <4 x i32> @shl4(<4 x i32> %A) nounwind {
+entry:
+; CHECK: shl4
+; CHECK: pslld
+; CHECK-NEXT: pslld
+ %B = shl <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
+ %C = shl <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
+ %K = xor <4 x i32> %B, %C
+ ret <4 x i32> %K
+}
+
+define <4 x i32> @shr4(<4 x i32> %A) nounwind {
+entry:
+; CHECK: shr4
+; CHECK: psrld
+; CHECK-NEXT: psrld
+ %B = lshr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
+ %C = lshr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
+ %K = xor <4 x i32> %B, %C
+ ret <4 x i32> %K
+}
+
+define <4 x i32> @sra4(<4 x i32> %A) nounwind {
+entry:
+; CHECK: sra4
+; CHECK: psrad
+; CHECK-NEXT: psrad
+ %B = ashr <4 x i32> %A, < i32 2, i32 2, i32 2, i32 2>
+ %C = ashr <4 x i32> %A, < i32 1, i32 1, i32 1, i32 1>
+ %K = xor <4 x i32> %B, %C
+ ret <4 x i32> %K
+}
+
+define <2 x i64> @shl2(<2 x i64> %A) nounwind {
+entry:
+; CHECK: shl2
+; CHECK: psllq
+; CHECK-NEXT: psllq
+ %B = shl <2 x i64> %A, < i64 2, i64 2>
+ %C = shl <2 x i64> %A, < i64 9, i64 9>
+ %K = xor <2 x i64> %B, %C
+ ret <2 x i64> %K
+}
+
+define <2 x i64> @shr2(<2 x i64> %A) nounwind {
+entry:
+; CHECK: shr2
+; CHECK: psrlq
+; CHECK-NEXT: psrlq
+ %B = lshr <2 x i64> %A, < i64 8, i64 8>
+ %C = lshr <2 x i64> %A, < i64 1, i64 1>
+ %K = xor <2 x i64> %B, %C
+ ret <2 x i64> %K
+}
+
+
+define <8 x i16> @shl8(<8 x i16> %A) nounwind {
+entry:
+; CHECK: shl8
+; CHECK: psllw
+; CHECK-NEXT: psllw
+ %B = shl <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ %C = shl <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %K = xor <8 x i16> %B, %C
+ ret <8 x i16> %K
+}
+
+define <8 x i16> @shr8(<8 x i16> %A) nounwind {
+entry:
+; CHECK: shr8
+; CHECK: psrlw
+; CHECK-NEXT: psrlw
+ %B = lshr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ %C = lshr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %K = xor <8 x i16> %B, %C
+ ret <8 x i16> %K
+}
+
+define <8 x i16> @sra8(<8 x i16> %A) nounwind {
+entry:
+; CHECK: sra8
+; CHECK: psraw
+; CHECK-NEXT: psraw
+ %B = ashr <8 x i16> %A, < i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ %C = ashr <8 x i16> %A, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ %K = xor <8 x i16> %B, %C
+ ret <8 x i16> %K
+}
+
+; Non-splat patterns below
+
+
+define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
+entry:
+; CHECK: sll8_nosplat
+; CHECK-NOT: psll
+; CHECK-NOT: psll
+ %B = shl <8 x i16> %A, < i16 1, i16 2, i16 3, i16 6, i16 2, i16 2, i16 2, i16 2>
+ %C = shl <8 x i16> %A, < i16 9, i16 7, i16 5, i16 1, i16 4, i16 1, i16 1, i16 1>
+ %K = xor <8 x i16> %B, %C
+ ret <8 x i16> %K
+}
+
+
+define <2 x i64> @shr2_nosplat(<2 x i64> %A) nounwind {
+entry:
+; CHECK: shr2_nosplat
+; CHECK-NOT: psrlq
+; CHECK-NOT: psrlq
+ %B = lshr <2 x i64> %A, < i64 8, i64 1>
+ %C = lshr <2 x i64> %A, < i64 1, i64 0>
+ %K = xor <2 x i64> %B, %C
+ ret <2 x i64> %K
+}
+
+
+; Other shifts
+
+define <2 x i32> @shl2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shl2_other
+; CHECK-NOT: psllq
+ %B = shl <2 x i32> %A, < i32 2, i32 2>
+ %C = shl <2 x i32> %A, < i32 9, i32 9>
+ %K = xor <2 x i32> %B, %C
+ ret <2 x i32> %K
+}
+
+define <2 x i32> @shr2_other(<2 x i32> %A) nounwind {
+entry:
+; CHECK: shr2_other
+; CHECK-NOT: psrlq
+ %B = lshr <2 x i32> %A, < i32 8, i32 8>
+ %C = lshr <2 x i32> %A, < i32 1, i32 1>
+ %K = xor <2 x i32> %B, %C
+ ret <2 x i32> %K
+}