diff options
author | Benjamin Kramer <benny.kra@googlemail.com> | 2012-12-22 16:07:56 +0000 |
---|---|---|
committer | Benjamin Kramer <benny.kra@googlemail.com> | 2012-12-22 16:07:56 +0000 |
commit | 2f8a6cdfa3bc0bfa4532da89e574666c5251cdb5 (patch) | |
tree | 7b8d1f46fdf06a86b5ac8ed24ebcc10a3dede709 | |
parent | 17347912b46213658074416133396caffd034e0c (diff) | |
download | llvm-2f8a6cdfa3bc0bfa4532da89e574666c5251cdb5.tar.gz llvm-2f8a6cdfa3bc0bfa4532da89e574666c5251cdb5.tar.bz2 llvm-2f8a6cdfa3bc0bfa4532da89e574666c5251cdb5.tar.xz |
X86: Turn mul of <4 x i32> into pmuludq when no SSE4.1 is available.
pmuludq is slow, but it turns out that all the unpacking and packing of the
scalarized mul is even slower. 10% speedup on loop-vectorized paq8p.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170985 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 34 | ||||
-rw-r--r-- | test/CodeGen/X86/sse2-mul.ll | 14 |
2 files changed, 43 insertions, 5 deletions
```diff
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fd883075a2..262475e97f 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -870,6 +870,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
     setOperationAction(ISD::ADD, MVT::v8i16, Legal);
     setOperationAction(ISD::ADD, MVT::v4i32, Legal);
     setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
     setOperationAction(ISD::MUL, MVT::v2i64, Custom);
     setOperationAction(ISD::SUB, MVT::v16i8, Legal);
     setOperationAction(ISD::SUB, MVT::v8i16, Legal);
@@ -11027,17 +11028,43 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {

 static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
                         SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
   EVT VT = Op.getValueType();

   // Decompose 256-bit ops into smaller 128-bit ops.
   if (VT.is256BitVector() && !Subtarget->hasInt256())
     return Lower256IntArith(Op, DAG);

+  SDValue A = Op.getOperand(0);
+  SDValue B = Op.getOperand(1);
+
+  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+  if (VT == MVT::v4i32) {
+    assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+           "Should not custom lower when pmuldq is available!");
+
+    // Extract the odd parts.
+    const int UnpackMask[] = { 1, -1, 3, -1 };
+    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+    // Multiply the even parts.
+    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+    // Now multiply odd parts.
+    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+    Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+    Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+    // Merge the two vectors back together with a shuffle. This expands into 2
+    // shuffles.
+    const int ShufMask[] = { 0, 4, 2, 6 };
+    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+  }
+
   assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
          "Only know how to lower V2I64/V4I64 multiply");

-  DebugLoc dl = Op.getDebugLoc();
-
   // Ahi = psrlqi(a, 32);
   // Bhi = psrlqi(b, 32);
   //
@@ -11049,9 +11076,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   // AhiBlo = psllqi(AhiBlo, 32);
   // return AloBlo + AloBhi + AhiBlo;

-  SDValue A = Op.getOperand(0);
-  SDValue B = Op.getOperand(1);
-
   SDValue ShAmt = DAG.getConstant(32, MVT::i32);

   SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
new file mode 100644
index 0000000000..0466d60ec3
--- /dev/null
+++ b/test/CodeGen/X86/sse2-mul.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
+  %m = mul <4 x i32> %x, %y
+  ret <4 x i32> %m
+; CHECK: test1:
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: shufps $-120
+; CHECK: pshufd $-40
+; CHECK: ret
+}
```