diff options
author | Filipe Cabecinhas <me@filcab.net> | 2014-05-19 19:45:57 +0000 |
---|---|---|
committer | Filipe Cabecinhas <me@filcab.net> | 2014-05-19 19:45:57 +0000 |
commit | ca162faee2e8f9e9f8938b164a0e959307194baa (patch) | |
tree | de63c1a509bacc0839608b29d2c83105172ad611 /lib/Target/X86/X86ISelLowering.cpp | |
parent | 861e2ef7b0fb312c9acab325c622c7758f3eac0a (diff) | |
download | llvm-ca162faee2e8f9e9f8938b164a0e959307194baa.tar.gz llvm-ca162faee2e8f9e9f8938b164a0e959307194baa.tar.bz2 llvm-ca162faee2e8f9e9f8938b164a0e959307194baa.tar.xz |
Added more insertps optimizations
Summary:
When inserting an element that's coming from a vector load or a broadcast
of a vector (or scalar) load, combine the load into the insertps
instruction.
Added PerformINSERTPSCombine for the case where we need to fix the load
(load of a vector + insertps with a non-zero CountS).
Added patterns for the broadcasts.
Also added tests for SSE4.1, AVX, and AVX2.
Reviewers: delena, nadav, craig.topper
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3581
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209156 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 60 |
1 files changed, 49 insertions, 11 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index e1db618baf..cfdaf0f8e3 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -7412,6 +7412,23 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index, + SelectionDAG &DAG) { + SDLoc dl(Load); + MVT VT = Load->getSimpleValueType(0); + MVT EVT = VT.getVectorElementType(); + SDValue Addr = Load->getOperand(1); + SDValue NewAddr = DAG.getNode( + ISD::ADD, dl, Addr.getSimpleValueType(), Addr, + DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType())); + + SDValue NewLoad = + DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, + DAG.getMachineFunction().getMachineMemOperand( + Load->getMemOperand(), 0, EVT.getStoreSize())); + return NewLoad; +} + // It is only safe to call this function if isINSERTPSMask is true for // this shufflevector mask. static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, @@ -7423,7 +7440,6 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, // If we're transferring an i32 from memory to a specific element in a // register, we output a generic DAG that will match the PINSRD // instruction. - // TODO: Optimize for AVX cases too (VINSERTPS) MVT VT = SVOp->getSimpleValueType(0); MVT EVT = VT.getVectorElementType(); SDValue V1 = SVOp->getOperand(0); @@ -7456,17 +7472,10 @@ static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl, // Trivial case, when From comes from a load and is only used by the // shuffle. Make it use insertps from the vector that we need from that // load. - SDValue Addr = From.getOperand(1); - SDValue NewAddr = - DAG.getNode(ISD::ADD, dl, Addr.getSimpleValueType(), Addr, - DAG.getConstant(DestIndex * EVT.getStoreSize(), - Addr.getSimpleValueType())); - - LoadSDNode *Load = cast<LoadSDNode>(From); SDValue NewLoad = - DAG.getLoad(EVT, dl, Load->getChain(), NewAddr, - DAG.getMachineFunction().getMachineMemOperand( - Load->getMemOperand(), 0, EVT.getStoreSize())); + NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG); + if (!NewLoad.getNode()) + return SDValue(); if (EVT == MVT::f32) { // Create this as a scalar to vector to match the instruction pattern. @@ -20281,6 +20290,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG, + const X86Subtarget *Subtarget) { + SDLoc dl(N); + MVT VT = N->getOperand(1)->getSimpleValueType(0); + assert(VT == MVT::v4f32 || + VT == MVT::v4i32 && "X86insertps is only defined for v4x32"); + + SDValue Ld = N->getOperand(1); + if (MayFoldLoad(Ld)) { + // Extract the countS bits from the immediate so we can get the proper + // address when narrowing the vector load to a specific element. + // When the second source op is a memory address, interps doesn't use + // countS and just gets an f32 from that address. + unsigned DestIndex = + cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6; + Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG); + } else + return SDValue(); + + // Create this as a scalar to vector to match the instruction pattern. + SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld); + // countS bits are ignored when loading from memory on insertps, which + // means we don't need to explicitly set them to 0. + return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0), + LoadScalarToVector, N->getOperand(2)); +} + // Helper function of PerformSETCCCombine. It is to materialize "setb reg" // as "sbb reg,reg", since it can be extended without zext and produces // an all-ones bit which is more useful than 0/1 in some cases. @@ -20584,6 +20620,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget); case ISD::INTRINSIC_WO_CHAIN: return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget); + case X86ISD::INSERTPS: + return PerformINSERTPSCombine(N, DAG, Subtarget); } return SDValue(); |