diff options
author | Nate Begeman <natebegeman@mac.com> | 2010-07-27 22:37:06 +0000 |
---|---|---|
committer | Nate Begeman <natebegeman@mac.com> | 2010-07-27 22:37:06 +0000 |
commit | bdcb5afb77547337ba148ce24d5e1046c0b25ced (patch) | |
tree | 9fcb26351ca757a6e412769796e5d7f5f7dd9b0c | |
parent | 622b7cf147f9231a1d6e3aac81a2dd1b6047b26c (diff) | |
download | llvm-bdcb5afb77547337ba148ce24d5e1046c0b25ced.tar.gz llvm-bdcb5afb77547337ba148ce24d5e1046c0b25ced.tar.bz2 llvm-bdcb5afb77547337ba148ce24d5e1046c0b25ced.tar.xz |
~40% faster vector shl <4 x i32> on SSE 4.1 Larger improvements for smaller types coming in future patches.
For:
define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
%shl = shl <4 x i32> %r, %a ; <<4 x i32>> [#uses=1]
%tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1]
ret <2 x i64> %tmp2
}
We get:
_shl: ## @shl
pslld $23, %xmm1
paddd LCPI0_0, %xmm1
cvttps2dq %xmm1, %xmm1
pmulld %xmm1, %xmm0
ret
Instead of:
_shl: ## @shl
pshufd $3, %xmm0, %xmm2
movd %xmm2, %eax
pshufd $3, %xmm1, %xmm2
movd %xmm2, %ecx
shll %cl, %eax
movd %eax, %xmm2
pshufd $1, %xmm0, %xmm3
movd %xmm3, %eax
pshufd $1, %xmm1, %xmm3
movd %xmm3, %ecx
shll %cl, %eax
movd %eax, %xmm3
punpckldq %xmm2, %xmm3
movd %xmm0, %eax
movd %xmm1, %ecx
shll %cl, %eax
movd %eax, %xmm2
movhlps %xmm0, %xmm0
movd %xmm0, %eax
movhlps %xmm1, %xmm1
movd %xmm1, %ecx
shll %cl, %eax
movd %eax, %xmm0
punpckldq %xmm0, %xmm2
movdqa %xmm2, %xmm0
punpckldq %xmm3, %xmm0
ret
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@109549 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 33 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 1 | ||||
-rw-r--r-- | test/CodeGen/X86/vec_shift4.ll | 14 |
3 files changed, 48 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 618bb90d44..88bc8d0a92 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -838,6 +838,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); + // Can turn SHL into an integer multiply. + setOperationAction(ISD::SHL, MVT::v4i32, Custom); + // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional @@ -7498,6 +7501,35 @@ SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const { return Res; } +SDValue X86TargetLowering::LowerSHL(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + SDValue R = Op.getOperand(0); + + assert(Subtarget->hasSSE41() && "Cannot lower SHL without SSE4.1 or later"); + assert(VT == MVT::v4i32 && "Only know how to lower v4i32"); + + Op = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, + DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32), + Op.getOperand(1), DAG.getConstant(23, MVT::i32)); + + std::vector<Constant*> CV; + LLVMContext *Context = DAG.getContext(); + CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); + CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); + CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); + CV.push_back(ConstantInt::get(*Context, APInt(32, 0x3f800000U))); + Constant *C = ConstantVector::get(CV); + SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16); + SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx, + PseudoSourceValue::getConstantPool(), 0, + false, false, 16); + + Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend); + Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, Op); + Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op); + return DAG.getNode(ISD::MUL, dl, VT, Op, R); +} SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const { // Lower the "add/sub/mul with overflow" instruction into a regular ins plus @@ -7730,6 +7762,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::CTLZ: return LowerCTLZ(Op, DAG); case ISD::CTTZ: return LowerCTTZ(Op, DAG); case ISD::MUL: return LowerMUL_V2I64(Op, DAG); + case ISD::SHL: return LowerSHL(Op, DAG); case ISD::SADDO: case ISD::UADDO: case ISD::SSUBO: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 96c97d9f9e..3556579ae6 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -723,6 +723,7 @@ namespace llvm { SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSHL(SDValue Op, SelectionDAG &DAG) const; SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) const; diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll new file mode 100644 index 0000000000..d8f4e4ec68 --- /dev/null +++ b/test/CodeGen/X86/vec_shift4.ll @@ -0,0 +1,14 @@ +; RUN: llc < %s -march=x86 -mattr=+sse41 | FileCheck %s + +define <2 x i64> @shl(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp { +entry: +; CHECK-NOT: shll +; CHECK: pslld +; CHECK: paddd +; CHECK: cvttps2dq +; CHECK: pmulld + + %shl = shl <4 x i32> %r, %a ; <<4 x i32>> [#uses=1] + %tmp2 = bitcast <4 x i32> %shl to <2 x i64> ; <<2 x i64>> [#uses=1] + ret <2 x i64> %tmp2 +} |