-rw-r--r-- | lib/CodeGen/SelectionDAG/DAGCombiner.cpp   | 254
-rw-r--r-- | test/CodeGen/X86/MergeConsecutiveStores.ll | 150
-rw-r--r-- | test/CodeGen/X86/loop-strength-reduce-2.ll |  10
-rw-r--r-- | test/CodeGen/X86/loop-strength-reduce-3.ll |   5
-rw-r--r-- | test/CodeGen/X86/loop-strength-reduce.ll   |   5
5 files changed, 412 insertions, 12 deletions
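
The DAGCombiner change below merges runs of consecutive narrow stores into a single wide store. As a standalone illustration of why the rewrite is sound (plain C++, not LLVM code; the struct mirrors %struct.A from the new test, and the little-endian byte order of the x86 targets exercised here is assumed), eight adjacent one-byte constant stores write exactly the same bytes as one eight-byte store of the combined constant:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Mirrors %struct.A from MergeConsecutiveStores.ll: eight adjacent bytes.
    struct A { uint8_t f0, f1, f2, f3, f4, f5, f6, f7; };

    // Eight one-byte constant stores, as in the IR of @merge_const_store.
    static void narrow_stores(A *p) {
      p->f0 = 1; p->f1 = 2; p->f2 = 3; p->f3 = 4;
      p->f4 = 5; p->f5 = 6; p->f6 = 7; p->f7 = 8;
    }

    // One eight-byte store of the combined constant. On little-endian x86,
    // 0x0807060504030201 lays down the bytes 1, 2, ..., 8 in that order.
    static void wide_store(A *p) {
      const uint64_t k = UINT64_C(0x0807060504030201);
      std::memcpy(p, &k, sizeof k);
    }

    int main() {
      A a, b;
      narrow_stores(&a);
      wide_store(&b);
      assert(std::memcmp(&a, &b, sizeof(A)) == 0); // identical memory contents
      return 0;
    }

The new test below checks for exactly this collapse: the eight i8 stores of @merge_const_store become a single vector store (movq %xmm0).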
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index da3293bc8c..db7140d114 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -301,6 +301,10 @@ namespace {
     /// looking for a better chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
+    /// Merge consecutive store operations into a wide store.
+    /// \return True if some memory operations were changed.
+    bool MergeConsecutiveStores(StoreSDNode *N);
+
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
       : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
@@ -7423,6 +7427,250 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
   return SDValue();
 }
 
+/// Returns the base pointer and an integer offset from that object.
+static std::pair<SDValue, int64_t> GetPointerBaseAndOffset(SDValue Ptr) {
+  if (Ptr->getOpcode() == ISD::ADD &&
+      isa<ConstantSDNode>(Ptr->getOperand(1))) {
+    int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
+    SDValue Base = Ptr->getOperand(0);
+    return std::make_pair(Base, Offset);
+  }
+
+  return std::make_pair(Ptr, 0);
+}
+
+struct ConsecutiveMemoryChainSorter {
+  typedef std::pair<LSBaseSDNode*, int64_t> MemLink;
+  bool operator()(MemLink LHS, MemLink RHS) {
+    return LHS.second < RHS.second;
+  }
+};
+
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+  EVT MemVT = St->getMemoryVT();
+  int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
+
+  // Don't handle vectors.
+  if (MemVT.isVector() || !MemVT.isSimple())
+    return false;
+
+  // Perform an early exit check. Do not bother looking at stored values that
+  // are not constants or loads.
+  SDValue StoredVal = St->getValue();
+  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
+      !isa<LoadSDNode>(StoredVal))
+    return false;
+
+  // Is this a load-to-store or a constant store?
+  bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
+
+  // Only look at ends of store chains.
+  SDValue Chain = SDValue(St, 1);
+  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
+    return false;
+
+  // This holds the base pointer and the offset in bytes from the base pointer.
+  std::pair<SDValue, int64_t> BasePtr =
+      GetPointerBaseAndOffset(St->getBasePtr());
+
+  // We must have a base and an offset.
+  if (!BasePtr.first.getNode())
+    return false;
+
+  // Do not handle stores to undef base pointers.
+  if (BasePtr.first.getOpcode() == ISD::UNDEF)
+    return false;
+
+  SmallVector<std::pair<StoreSDNode*, int64_t>, 8> StoreNodes;
+  // Walk up the chain and look for nodes with offsets from the same
+  // base pointer. Stop when reaching an instruction with a different kind
+  // or an instruction which has a different base pointer.
+  StoreSDNode *Index = St;
+  while (Index) {
+    // If the chain has more than one use, then we can't reorder the mem ops.
+    if (Index != St && !SDValue(Index, 1)->hasOneUse())
+      break;
+
+    // Find the base pointer and offset for this memory node.
+    std::pair<SDValue, int64_t> Ptr =
+        GetPointerBaseAndOffset(Index->getBasePtr());
+
+    // Check that the base pointer is the same as the original one.
+    if (Ptr.first.getNode() != BasePtr.first.getNode())
+      break;
+
+    // Check that the alignment is the same.
+    if (Index->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Index->isVolatile() || Index->isIndexed())
+      break;
+
+    // No truncation.
+    if (StoreSDNode *St = dyn_cast<StoreSDNode>(Index))
+      if (St->isTruncatingStore())
+        break;
+
+    // The stored memory type must be the same.
+    if (Index->getMemoryVT() != MemVT)
+      break;
+
+    // We found a potential memory operand to merge.
+    StoreNodes.push_back(std::make_pair(Index, Ptr.second));
+
+    // Move up the chain to the next memory operation.
+    Index = dyn_cast<StoreSDNode>(Index->getChain().getNode());
+  }
+
+  // Check if there is anything to merge.
+  if (StoreNodes.size() < 2)
+    return false;
+
+  // Remember which node is the earliest node in the chain.
+  LSBaseSDNode *EarliestOp = StoreNodes.back().first;
+
+  // Sort the memory operands according to their distance from the base
+  // pointer.
+  std::sort(StoreNodes.begin(), StoreNodes.end(),
+            ConsecutiveMemoryChainSorter());
+
+  LSBaseSDNode *FirstInChain = StoreNodes[0].first;
+
+  // Scan the memory operations on the chain and find the first
+  // non-consecutive store memory address.
+  unsigned LastConsecutiveStore = 0;
+  int64_t StartAddress = StoreNodes[0].second;
+  for (unsigned i = 1; i < StoreNodes.size(); ++i) {
+    int64_t CurrAddress = StoreNodes[i].second;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+      break;
+    LastConsecutiveStore = i;
+  }
+
+  // Store the constants into memory as one consecutive store.
+  if (!IsLoadSrc) {
+    unsigned LastConst = 0;
+    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+      SDValue StoredVal = StoreNodes[i].first->getValue();
+      bool IsConst = (isa<ConstantSDNode>(StoredVal) ||
+                      isa<ConstantFPSDNode>(StoredVal));
+      if (!IsConst)
+        break;
+      LastConst = i;
+    }
+    unsigned NumElem = std::min(LastConsecutiveStore + 1, LastConst + 1);
+    if (NumElem < 2)
+      return false;
+
+    EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    DebugLoc DL = StoreNodes[0].first->getDebugLoc();
+    SmallVector<SDValue, 8> Ops;
+
+    for (unsigned i = 0; i < NumElem; ++i) {
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
+      Ops.push_back(St->getValue());
+    }
+
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL,
+                             JointMemOpVT, &Ops[0], Ops.size());
+
+    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, BV,
+                                    FirstInChain->getBasePtr(),
+                                    FirstInChain->getPointerInfo(),
+                                    false, false,
+                                    FirstInChain->getAlignment());
+
+    for (unsigned i = 0; i < NumElem; ++i) {
+      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
+      CombineTo(St, NewStore);
+    }
+    return true;
+  }
+
+  // Look for load nodes which are used by the stored values.
+  SmallVector<std::pair<LoadSDNode*, int64_t>, 8> LoadNodes;
+
+  // Find acceptable loads. Loads need to have the same chain (token factor),
+  // must not be zext, volatile, indexed, and they must be consecutive.
+  SDValue LdBasePtr;
+  for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+    LoadSDNode *Ld = dyn_cast<LoadSDNode>(StoreNodes[i].first->getValue());
+    if (!Ld) break;
+
+    // Loads must only have one use.
+    if (!Ld->hasNUsesOfValue(1, 0))
+      break;
+
+    // Check that the alignment is the same as the stores.
+    if (Ld->getAlignment() != St->getAlignment())
+      break;
+
+    // The memory operands must not be volatile.
+    if (Ld->isVolatile() || Ld->isIndexed())
+      break;
+
+    if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+      break;
+
+    // The stored memory type must be the same.
+    if (Ld->getMemoryVT() != MemVT)
+      break;
+
+    std::pair<SDValue, int64_t> LdPtr =
+        GetPointerBaseAndOffset(Ld->getBasePtr());
+
+    // If this is not the first ptr that we check.
+    if (LdBasePtr.getNode()) {
+      // The base ptr must be the same.
+      if (LdPtr.first != LdBasePtr)
+        break;
+    } else {
+      LdBasePtr = LdPtr.first;
+    }
+
+    // We found a potential memory operand to merge.
+    LoadNodes.push_back(std::make_pair(Ld, LdPtr.second));
+  }
+
+  if (LoadNodes.size() < 2)
+    return false;
+
+  // Scan the memory operations on the chain and find the first
+  // non-consecutive load memory address.
+  unsigned LastConsecutiveLoad = 0;
+  StartAddress = LoadNodes[0].second;
+  for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+    int64_t CurrAddress = LoadNodes[i].second;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+      break;
+    LastConsecutiveLoad = i;
+  }
+
+  unsigned NumElem =
+      std::min(LastConsecutiveStore + 1, LastConsecutiveLoad + 1);
+
+  EVT JointMemOpVT = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+  DebugLoc LoadDL = LoadNodes[0].first->getDebugLoc();
+  DebugLoc StoreDL = StoreNodes[0].first->getDebugLoc();
+
+  LoadSDNode *FirstLoad = LoadNodes[0].first;
+  SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL,
+                                FirstLoad->getChain(),
+                                FirstLoad->getBasePtr(),
+                                FirstLoad->getPointerInfo(),
+                                false, false, false,
+                                FirstLoad->getAlignment());
+
+  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), StoreDL, NewLoad,
+                                  FirstInChain->getBasePtr(),
+                                  FirstInChain->getPointerInfo(),
+                                  false, false,
+                                  FirstInChain->getAlignment());
+
+  for (unsigned i = 0; i < NumElem; ++i) {
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].first);
+    CombineTo(St, NewStore);
+  }
+
+  return true;
+}
+
 SDValue DAGCombiner::visitSTORE(SDNode *N) {
   StoreSDNode *ST = cast<StoreSDNode>(N);
   SDValue Chain = ST->getChain();
@@ -7624,6 +7872,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
                              ST->getAlignment());
   }
 
+
+  // Only perform this optimization before the types are legal, because we
+  // don't want to generate illegal types in this optimization.
+  if (!LegalTypes && MergeConsecutiveStores(ST))
+    return SDValue(N, 0);
+
   return ReduceLoadOpStoreWidth(N);
 }
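
The core of MergeConsecutiveStores is bookkeeping over (base pointer, byte offset) pairs: each address is decomposed by GetPointerBaseAndOffset, candidates sharing a base pointer are sorted by offset, and only the leading run whose offsets step by exactly ElementSizeBytes may be merged. A simplified standalone sketch of that scan (not LLVM code; the integer node ids and the vector are stand-ins for the SDNode bookkeeping above):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Stand-in for the (memory node, byte offset from base) pairs collected
    // while walking the store chain; the int id is purely illustrative.
    typedef std::pair<int, int64_t> MemLink;

    // Sort by offset, then count the leading run of offsets that are exactly
    // ElementSizeBytes apart; only that prefix is safe to merge.
    static size_t MergeableRun(std::vector<MemLink> &Nodes,
                               int64_t ElementSizeBytes) {
      if (Nodes.size() < 2)
        return 0;
      std::sort(Nodes.begin(), Nodes.end(),
                [](const MemLink &L, const MemLink &R) {
                  return L.second < R.second;
                });
      size_t Last = 0;
      int64_t Start = Nodes[0].second;
      for (size_t i = 1; i < Nodes.size(); ++i) {
        if (Nodes[i].second - Start != ElementSizeBytes * static_cast<int64_t>(i))
          break; // the first gap ends the mergeable prefix
        Last = i;
      }
      return Last + 1;
    }

    int main() {
      // i8 stores to base+0..base+3 plus a stray one at base+7: only 4 merge.
      std::vector<MemLink> Stores = {{0, 0}, {1, 1}, {2, 2}, {3, 3}, {4, 7}};
      std::printf("mergeable stores: %zu\n", MergeableRun(Stores, 1));
      return 0;
    }

For load-to-store chains the patch runs this scan twice, once over the candidate stores and once over their source loads, and std::min picks the run length both sides can support.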
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
new file mode 100644
index 0000000000..435f38c8ad
--- /dev/null
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -0,0 +1,150 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
+
+@a = common global [10000 x %struct.A] zeroinitializer, align 8
+
+; Move all of the constants using a single vector store.
+; CHECK: merge_const_store
+; CHECK: movq %xmm0
+; CHECK: ret
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 5, i8* %6, align 1
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+; Move the first 4 constants as a single vector. Move the rest as scalars.
+; CHECK: merge_nonconst_store
+; CHECK: movd %xmm0
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %10, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %11, %.lr.ph ], [ %p, %0 ]
+  %2 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %2, align 1
+  %3 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %3, align 1
+  %4 = getelementptr inbounds %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %4, align 1
+  %5 = getelementptr inbounds %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %5, align 1
+  %6 = getelementptr inbounds %struct.A* %.01, i64 0, i32 4
+  store i8 %zz, i8* %6, align 1                   ; <----------- Not a const;
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %8, align 1
+  %9 = getelementptr inbounds %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %9, align 1
+  %10 = add nsw i32 %i.02, 1
+  %11 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %10, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
+
+
+;CHECK: merge_loads
+; load:
+;CHECK: movw
+; store:
+;CHECK: movw
+;CHECK: ret
+define void @merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+  br label %4
+
+; <label>:4                                       ; preds = %4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %9, %4 ]
+  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %10, %4 ]
+  %5 = load i8* %2, align 1
+  %6 = load i8* %3, align 1
+  %7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 %5, i8* %7, align 1
+  %8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  store i8 %6, i8* %8, align 1
+  %9 = add nsw i32 %i.02, 1
+  %10 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %9, %count
+  br i1 %exitcond, label %._crit_edge, label %4
+
+._crit_edge:                                      ; preds = %4, %0
+  ret void
+}
+
+; The loads and the stores are interleaved. Can't merge them.
+;CHECK: no_merge_loads
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: movb
+;CHECK: ret
+define void @no_merge_loads(i32 %count, %struct.A* noalias nocapture %q, %struct.A* noalias nocapture %p) nounwind uwtable noinline ssp {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0
+  %2 = getelementptr inbounds %struct.A* %q, i64 0, i32 0
+  %3 = getelementptr inbounds %struct.A* %q, i64 0, i32 1
+  br label %a4
+
+a4:                                               ; preds = %4, %.lr.ph
+  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
+  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
+  %a5 = load i8* %2, align 1
+  %a7 = getelementptr inbounds %struct.A* %.01, i64 0, i32 0
+  store i8 %a5, i8* %a7, align 1
+  %a8 = getelementptr inbounds %struct.A* %.01, i64 0, i32 1
+  %a6 = load i8* %3, align 1
+  store i8 %a6, i8* %a8, align 1
+  %a9 = add nsw i32 %i.02, 1
+  %a10 = getelementptr inbounds %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %a9, %count
+  br i1 %exitcond, label %._crit_edge, label %a4
+
+._crit_edge:                                      ; preds = %4, %0
+  ret void
+}
+
+
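The three loop-strength-reduce tests below change for the same reason: the two adjacent 32-bit constant stores they used to check for (movl $4 and movl $5) are now merged into one 64-bit store, which llc emits through an XMM register as movlpd; the RUN lines gain -mcpu=corei7 so SSE is available on the 32-bit target. A small sketch of the combined value (plain C++, assuming little-endian layout):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t pair[2];
      pair[0] = 4; // was: movl $4, -4(%ecx)
      pair[1] = 5; // was: movl $5, (%ecx)

      // The single merged 8-byte store writes this value in one operation.
      uint64_t wide;
      std::memcpy(&wide, pair, sizeof wide);
      assert(wide == UINT64_C(0x0000000500000004));
      return 0;
    }
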
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
index b546462b68..b094fed2f6 100644
--- a/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,20 +1,18 @@
-; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s -check-prefix=STATIC
 ;
 ; Make sure the common loop invariant A is hoisted up to preheader,
 ; since too many registers are needed to subsume it into the addressing modes.
 ; It's safe to sink A in when it's not pic.
 
 ; PIC: align
-; PIC: movl $4, -4([[REG:%e[a-z]+]])
-; PIC: movl $5, ([[REG]])
+; PIC: movlpd %xmm0, -4([[REG:%e[a-z]+]])
 ; PIC: addl $4, [[REG]]
 ; PIC: decl {{%e[[a-z]+}}
 ; PIC: jne
 
 ; STATIC: align
-; STATIC: movl $4, -4(%ecx)
-; STATIC: movl $5, (%ecx)
+; STATIC: movlpd %xmm0, -4(%ecx)
 ; STATIC: addl $4, %ecx
 ; STATIC: decl %eax
 ; STATIC: jne
diff --git a/test/CodeGen/X86/loop-strength-reduce-3.ll b/test/CodeGen/X86/loop-strength-reduce-3.ll
index b1c9fb9c07..e2d77f2a6f 100644
--- a/test/CodeGen/X86/loop-strength-reduce-3.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-3.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 -relocation-model=dynamic-no-pic | FileCheck %s
 
 ; CHECK: align
-; CHECK: movl $4, -4(%ecx)
-; CHECK: movl $5, (%ecx)
+; CHECK: movlpd %xmm0, -4(%ecx)
 ; CHECK: addl $4, %ecx
 ; CHECK: decl %eax
 ; CHECK: jne
diff --git a/test/CodeGen/X86/loop-strength-reduce.ll b/test/CodeGen/X86/loop-strength-reduce.ll
index 42c6ac4983..d197451eee 100644
--- a/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,8 +1,7 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 -relocation-model=static | FileCheck %s
 
 ; CHECK: align
-; CHECK: movl $4, -4(%ecx)
-; CHECK: movl $5, (%ecx)
+; CHECK: movlpd %xmm0, -4(%ecx)
 ; CHECK: addl $4, %ecx
 ; CHECK: decl %eax
 ; CHECK: jne
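
A closing note on the guard added to visitSTORE: the merge is attempted only while !LegalTypes because the combined value is first materialized as a BUILD_VECTOR of the narrow memory type (v8i8 for the eight i8 stores above, for instance), and such vector types may be illegal on the target; running the combine before type legalization lets the legalizer lower whatever it produces into legal operations.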