diff options
-rw-r--r-- | include/llvm/CodeGen/LiveIntervalAnalysis.h | 20 | ||||
-rw-r--r-- | lib/CodeGen/LiveIntervalAnalysis.cpp | 67 | ||||
-rw-r--r-- | lib/CodeGen/RegisterCoalescer.cpp | 55 | ||||
-rw-r--r-- | test/CodeGen/ARM/coalesce-subregs.ll | 50 |
4 files changed, 189 insertions, 3 deletions
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index bf7469093a..1e8dde1255 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -165,6 +165,26 @@ namespace llvm { bool shrinkToUses(LiveInterval *li, SmallVectorImpl<MachineInstr*> *dead = 0); + /// extendToIndices - Extend the live range of LI to reach all points in + /// Indices. The points in the Indices array must be jointly dominated by + /// existing defs in LI. PHI-defs are added as needed to maintain SSA form. + /// + /// If a SlotIndex in Indices is the end index of a basic block, LI will be + /// extended to be live out of the basic block. + /// + /// See also LiveRangeCalc::extend(). + void extendToIndices(LiveInterval *LI, ArrayRef<SlotIndex> Indices); + + /// pruneValue - If an LI value is live at Kill, prune its live range by + /// removing any liveness reachable from Kill. Add live range end points to + /// EndPoints such that extendToIndices(LI, EndPoints) will reconstruct the + /// value's live range. + /// + /// Calling pruneValue() and extendToIndices() can be used to reconstruct + /// SSA form after adding defs to a virtual register. + void pruneValue(LiveInterval *LI, SlotIndex Kill, + SmallVectorImpl<SlotIndex> *EndPoints); + SlotIndexes *getSlotIndexes() const { return Indexes; } diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index f7d601e23e..6c26b79f11 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -731,6 +731,73 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, return CanSeparate; } +void LiveIntervals::extendToIndices(LiveInterval *LI, + ArrayRef<SlotIndex> Indices) { + assert(LRCalc && "LRCalc not initialized."); + LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); + for (unsigned i = 0, e = Indices.size(); i != e; ++i) + LRCalc->extend(LI, Indices[i]); +} + +void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill, + SmallVectorImpl<SlotIndex> *EndPoints) { + LiveRangeQuery LRQ(*LI, Kill); + assert (!LRQ.valueDefined() && "Can't prune value at the defining instr"); + + // Also can't prune a value that isn't there. + VNInfo *VNI = LRQ.valueOut(); + if (!VNI) + return; + + MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill); + SlotIndex MBBStart, MBBEnd; + tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB); + + // If VNI isn't live out from KillMBB, the value is trivially pruned. + if (LRQ.endPoint() < MBBEnd) { + LI->removeRange(Kill, LRQ.endPoint()); + if (EndPoints) EndPoints->push_back(LRQ.endPoint()); + return; + } + + // VNI is live out of KillMBB. + LI->removeRange(Kill, MBBEnd); + if (EndPoints) EndPoints->push_back(MBBEnd); + + // Find all blocks that are reachable from MBB without leaving VNI's live + // range. + for (df_iterator<MachineBasicBlock*> + I = df_begin(KillMBB), E = df_end(KillMBB); I != E;) { + MachineBasicBlock *MBB = *I; + // KillMBB itself was already handled. + if (MBB == KillMBB) { + ++I; + continue; + } + + // Check if VNI is live in to MBB. + tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB); + LiveRangeQuery LRQ(*LI, MBBStart); + if (LRQ.valueIn() != VNI) { + // This block isn't part of the VNI live range. Prune the search. + I.skipChildren(); + continue; + } + + // Prune the search if VNI is killed in MBB. + if (LRQ.endPoint() < MBBEnd) { + LI->removeRange(MBBStart, LRQ.endPoint()); + if (EndPoints) EndPoints->push_back(LRQ.endPoint()); + I.skipChildren(); + continue; + } + + // VNI is live through MBB. + LI->removeRange(MBBStart, MBBEnd); + if (EndPoints) EndPoints->push_back(MBBEnd); + ++I; + } +} //===----------------------------------------------------------------------===// // Register allocator hooks. diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 85a62b2645..67cad14afd 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -1277,6 +1277,11 @@ public: /// Returns false if any conflicts were impossible to resolve. bool resolveConflicts(JoinVals &Other); + /// Prune the live range of values in Other.LI where they would conflict with + /// CR_Replace values in LI. Collect end points for restoring the live range + /// after joining. + void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints); + /// Erase any machine instructions that have been coalesced away. /// Add erased instructions to ErasedInstrs. /// Add foreign virtual registers to ShrinkRegs if their live range ended at @@ -1421,6 +1426,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { // We have overlapping values, or possibly a kill of Other. // Recursively compute assignments up the dominator tree. Other.computeAssignment(V.OtherVNI->id, *this); + const Val &OtherV = Other.Vals[V.OtherVNI->id]; // Don't attempt resolving PHI values for now. if (VNI->isPHIDef()) @@ -1435,7 +1441,7 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { if (CP.isCoalescable(DefMI)) { // Some of the lanes copied from OtherVNI may be undef, making them undef // here too. - V.ValidLanes &= ~V.WriteLanes | Other.Vals[V.OtherVNI->id].ValidLanes; + V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes; return CR_Erase; } @@ -1453,7 +1459,23 @@ JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) { stripCopies(VNI) == stripCopies(V.OtherVNI)) return CR_Erase; - // FIXME: Identify CR_Replace opportunities. + // If the lanes written by this instruction were all undef in OtherVNI, it is + // still safe to join the live ranges. This can't be done with a simple value + // mapping, though - OtherVNI will map to multiple values: + // + // 1 %dst:ssub0 = FOO <-- OtherVNI + // 2 %src = BAR <-- VNI + // 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy. + // 4 BAZ %dst<kill> + // 5 QUUX %src<kill> + // + // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace + // handles this complex value mapping. + if ((V.WriteLanes & OtherV.ValidLanes) == 0) + return CR_Replace; + + // FIXME: Identify CR_Replace opportunities where the clobbered lanes are + // dead. return CR_Impossible; } @@ -1514,6 +1536,18 @@ bool JoinVals::resolveConflicts(JoinVals &Other) { return true; } +void JoinVals::pruneValues(JoinVals &Other, + SmallVectorImpl<SlotIndex> &EndPoints) { + for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) { + if (Vals[i].Resolution != CR_Replace) + continue; + SlotIndex Def = LI.getValNumInfo(i)->def; + LIS->pruneValue(&Other.LI, Def, &EndPoints); + DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def + << ": " << Other.LI << '\n'); + } +} + void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs, SmallVectorImpl<unsigned> &ShrinkRegs) { for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) { @@ -1557,6 +1591,14 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // All clear, the live ranges can be merged. + // The merging algorithm in LiveInterval::join() can't handle conflicting + // value mappings, so we need to remove any live ranges that overlap a + // CR_Replace resolution. Collect a set of end points that can be used to + // restore the live range after joining. + SmallVector<SlotIndex, 8> EndPoints; + LHSVals.pruneValues(RHSVals, EndPoints); + RHSVals.pruneValues(LHSVals, EndPoints); + // Erase COPY and IMPLICIT_DEF instructions. This may cause some external // registers to require trimming. SmallVector<unsigned, 8> ShrinkRegs; @@ -1574,6 +1616,15 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) { // ranges. They are reinserted after register allocation. MRI->clearKillFlags(LHS.reg); MRI->clearKillFlags(RHS.reg); + + if (EndPoints.empty()) + return true; + + // Recompute the parts of the live range we had to remove because of + // CR_Replace conflicts. + DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size() + << " points: " << LHS << '\n'); + LIS->extendToIndices(&LHS, EndPoints); return true; } diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll index fb0f4c67c9..dfb5b17306 100644 --- a/test/CodeGen/ARM/coalesce-subregs.ll +++ b/test/CodeGen/ARM/coalesce-subregs.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s +; RUN: llc < %s -mcpu=cortex-a9 -new-coalescer | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" target triple = "thumbv7-apple-ios0.0.0" @@ -66,3 +66,51 @@ do.end: ; preds = %do.body declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind + +; CHECK: f3 +; This function has lane insertions that span basic blocks. +; The trivial REG_SEQUENCE lowering can't handle that, but the coalescer can. +; +; void f3(float *p, float *q) { +; float32x2_t x; +; x[1] = p[3]; +; if (q) +; x[0] = q[0] + q[1]; +; else +; x[0] = p[2]; +; vst1_f32(p+4, x); +; } +; +; CHECK-NOT: vmov +; CHECK-NOT: vorr +define void @f3(float* %p, float* %q) nounwind ssp { +entry: + %arrayidx = getelementptr inbounds float* %p, i32 3 + %0 = load float* %arrayidx, align 4 + %vecins = insertelement <2 x float> undef, float %0, i32 1 + %tobool = icmp eq float* %q, null + br i1 %tobool, label %if.else, label %if.then + +if.then: ; preds = %entry + %1 = load float* %q, align 4 + %arrayidx2 = getelementptr inbounds float* %q, i32 1 + %2 = load float* %arrayidx2, align 4 + %add = fadd float %1, %2 + %vecins3 = insertelement <2 x float> %vecins, float %add, i32 0 + br label %if.end + +if.else: ; preds = %entry + %arrayidx4 = getelementptr inbounds float* %p, i32 2 + %3 = load float* %arrayidx4, align 4 + %vecins5 = insertelement <2 x float> %vecins, float %3, i32 0 + br label %if.end + +if.end: ; preds = %if.else, %if.then + %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ] + %add.ptr = getelementptr inbounds float* %p, i32 4 + %4 = bitcast float* %add.ptr to i8* + tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4) + ret void +} + +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind |