From 509a492442b7e889d615d3b451629c81a810aef1 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 15 Nov 2013 04:42:23 +0000
Subject: Add target hook to prevent folding some bitcasted loads.

This is to avoid this transformation in some cases:
  fold (conv (load x)) -> (load (conv*)x)

On architectures that don't natively support some vector loads
efficiently, casting the load to a smaller vector of larger types and
loading is more efficient.

Patch by Micah Villmow.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194783 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h     | 11 +++++++++
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  3 ++-
 lib/Target/R600/AMDGPUISelLowering.cpp   | 12 +++++++++
 lib/Target/R600/AMDGPUISelLowering.h     |  1 +
 test/CodeGen/R600/combine_vloads.ll      | 42 ++++++++++++++++++++++++++++++++
 5 files changed, 68 insertions(+), 1 deletion(-)
 create mode 100644 test/CodeGen/R600/combine_vloads.ll

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 2649d26cb7..5ab04f7944 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -204,6 +204,17 @@ public:
     return PredictableSelectIsExpensive;
   }
 
+  /// isLoadBitCastBeneficial() - Return true if the following transform
+  /// is beneficial:
+  ///   fold (conv (load x)) -> (load (conv*)x)
+  /// On architectures that don't natively support some vector loads
+  /// efficiently, casting the load to a smaller vector of larger types and
+  /// loading is more efficient; however, this can be undone by optimizations
+  /// in the DAG combiner.
+  virtual bool isLoadBitCastBeneficial(EVT /* Load */, EVT /* Bitcast */) const {
+    return true;
+  }
+
   /// Return the ValueType of the result of SETCC operations. Also used to
   /// obtain the target's preferred type for the condition operand of SELECT and
   /// BRCOND nodes. In the case of BRCOND the argument passed is MVT::Other
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 90cd1d3ee2..78543a4113 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5768,7 +5768,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
       // Do not change the width of a volatile load.
       !cast<LoadSDNode>(N0)->isVolatile() &&
-      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT))) {
+      (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     unsigned Align = TLI.getDataLayout()->
       getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 51ad217fc8..fdabea5169 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -196,6 +196,18 @@ MVT AMDGPUTargetLowering::getVectorIdxTy() const {
   return MVT::i32;
 }
 
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
+                                                   EVT CastTy) const {
+  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
+    return true;
+
+  unsigned LScalarSize = LoadTy.getScalarType().getSizeInBits();
+  unsigned CastScalarSize = CastTy.getScalarType().getSizeInBits();
+
+  return ((LScalarSize <= CastScalarSize) ||
+          (CastScalarSize >= 32) ||
+          (LScalarSize < 32));
+}
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index dacb086dd5..2dfd3cf492 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -77,6 +77,7 @@ public:
   virtual bool isFAbsFree(EVT VT) const;
   virtual bool isFNegFree(EVT VT) const;
   virtual MVT getVectorIdxTy() const;
+  virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
   virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
diff --git a/test/CodeGen/R600/combine_vloads.ll b/test/CodeGen/R600/combine_vloads.ll
new file mode 100644
index 0000000000..f8ec712c1e
--- /dev/null
+++ b/test/CodeGen/R600/combine_vloads.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
+
+;
+; kernel void combine_vloads(global char8* src, global char8* result) {
+;   for (int i = 0; i < 1024; ++i)
+;     result[i] = src[0] + src[1] + src[2] + src[3];
+; }
+;
+
+
+; 128-bit loads instead of many 8-bit
+; EG-LABEL: @combine_vloads:
+; EG: VTX_READ_128
+; EG: VTX_READ_128
+define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+entry:
+  br label %for.body
+
+for.exit:                                         ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.01 = phi i32 [ 0, %entry ], [ %tmp19, %for.body ]
+  %arrayidx_v4 = bitcast <8 x i8> addrspace(1)* %src to <32 x i8> addrspace(1)*
+  %0 = bitcast <32 x i8> addrspace(1)* %arrayidx_v4 to <8 x i32> addrspace(1)*
+  %vecload2 = load <8 x i32> addrspace(1)* %0, align 32
+  %1 = bitcast <8 x i32> %vecload2 to <32 x i8>
+  %tmp5 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %tmp8 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %tmp9 = add nsw <8 x i8> %tmp5, %tmp8
+  %tmp12 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %tmp13 = add nsw <8 x i8> %tmp9, %tmp12
+  %tmp16 = shufflevector <32 x i8> %1, <32 x i8> undef, <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+  %tmp17 = add nsw <8 x i8> %tmp13, %tmp16
+  %scevgep = getelementptr <8 x i8> addrspace(1)* %result, i32 %i.01
+  %2 = bitcast <8 x i8> %tmp17 to <2 x i32>
+  %3 = bitcast <8 x i8> addrspace(1)* %scevgep to <2 x i32> addrspace(1)*
+  store <2 x i32> %2, <2 x i32> addrspace(1)* %3, align 8
+  %tmp19 = add nsw i32 %i.01, 1
+  %exitcond = icmp eq i32 %tmp19, 1024
+  br i1 %exitcond, label %for.exit, label %for.body
+}
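For illustration, here is a minimal sketch (not part of the patch; the target name MyTargetLowering and the simplified predicate are invented for the example) of how another backend that prefers 32-bit-or-wider vector elements could implement the new hook in its *ISelLowering.cpp, so the DAG combiner keeps a wide-element vector load instead of rewriting it into a same-width load of sub-32-bit elements:

// Declared in the target's ISelLowering header, mirroring the AMDGPU change:
//   virtual bool isLoadBitCastBeneficial(EVT, EVT) const LLVM_OVERRIDE;
bool MyTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy) const {
  // Different total widths: this is not a pure reinterpretation of the same
  // bits, so let the generic combine behave as before.
  if (LoadTy.getSizeInBits() != CastTy.getSizeInBits())
    return true;

  // Block only the problematic direction for this hypothetical target:
  // replacing a load of >= 32-bit scalars with a same-width load of
  // sub-32-bit scalars.
  unsigned LoadScalarBits = LoadTy.getScalarType().getSizeInBits();
  unsigned CastScalarBits = CastTy.getScalarType().getSizeInBits();
  return !(LoadScalarBits >= 32 && CastScalarBits < 32);
}

For the types in combine_vloads.ll above, the combiner calls the hook with LoadTy = v8i32 and CastTy = v32i8: both are 256 bits wide, the load's scalars are 32 bits and the cast's are 8 bits, so the R600 implementation returns false, the (conv (load x)) fold is skipped, and the backend can still select the two VTX_READ_128 operations the test checks for.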