diff options
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 51 | ||||
-rw-r--r-- | test/CodeGen/X86/masked-iv-safe.ll | 5 | ||||
-rw-r--r-- | test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll | 10 | ||||
-rw-r--r-- | test/Transforms/LoopStrengthReduce/uglygep.ll | 58 |
4 files changed, 109 insertions, 15 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index c7344f6bbd..ecc96ae0b2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -774,6 +774,13 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { } namespace { +class LSRUse; +} +// Check if it is legal to fold 2 base registers. +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F); + +namespace { /// Cost - This class is used to measure and compare candidate formulae. class Cost { @@ -810,12 +817,14 @@ public: return NumRegs == ~0u; } - void RateFormula(const Formula &F, + void RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs = 0); void print(raw_ostream &OS) const; @@ -900,12 +909,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } } -void Cost::RateFormula(const Formula &F, +void Cost::RateFormula(const TargetTransformInfo &TTI, + const Formula &F, SmallPtrSet<const SCEV *, 16> &Regs, const DenseSet<const SCEV *> &VisitedRegs, const Loop *L, const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE, DominatorTree &DT, + const LSRUse &LU, SmallPtrSet<const SCEV *, 16> *LoserRegs) { // Tally up the registers. if (const SCEV *ScaledReg = F.ScaledReg) { @@ -932,7 +943,9 @@ void Cost::RateFormula(const Formula &F, // Determine how many (unfolded) adds we'll need inside the loop. size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0); if (NumBaseParts > 1) - NumBaseAdds += NumBaseParts - 1; + // Do not count the base and a possible second register if the target + // allows to fold 2 registers. + NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F)); // Tally up the non-zero immediates. for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(), @@ -1359,6 +1372,30 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU, + const Formula &F) { + // If F is used as an Addressing Mode, it may fold one Base plus one + // scaled register. If the scaled register is nil, do as if another + // element of the base regs is a 1-scaled register. + // This is possible if BaseRegs has at least 2 registers. + + // If this is not an address calculation, this is not an addressing mode + // use. + if (LU.Kind != LSRUse::Address) + return false; + + // F is already scaled. + if (F.Scale != 0) + return false; + + // We need to keep one register for the base and one to scale. + if (F.BaseRegs.size() < 2) + return false; + + return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, + F.BaseGV, F.BaseOffset, F.HasBaseReg, 1); + } + static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, Type *AccessTy, GlobalValue *BaseGV, int64_t BaseOffset, @@ -3690,7 +3727,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { // the corresponding bad register from the Regs set. Cost CostF; Regs.clear(); - CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, + CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU, &LoserRegs); if (CostF.isLoser()) { // During initial formula generation, undesirable formulae are generated @@ -3726,7 +3763,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); - CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE, + DT, LU); if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); @@ -4079,7 +4117,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, // the current best, prune the search at that point. NewCost = CurCost; NewRegs = CurRegs; - NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT); + NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT, + LU); if (NewCost < SolutionCost) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll index a7b036e9b6..c33cac2e05 100644 --- a/test/CodeGen/X86/masked-iv-safe.ll +++ b/test/CodeGen/X86/masked-iv-safe.ll @@ -3,9 +3,8 @@ ; RUN: not grep movz %t ; RUN: not grep sar %t ; RUN: not grep shl %t -; RUN: grep add %t | count 1 -; RUN: grep inc %t | count 4 -; RUN: grep dec %t | count 2 +; RUN: grep add %t | count 5 +; RUN: grep inc %t | count 2 ; RUN: grep lea %t | count 3 ; Optimize away zext-inreg and sext-inreg on the loop induction diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll index 8fbddf8ae4..652eb06225 100644 --- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll +++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll @@ -10,12 +10,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Verify that nothing uses the "dead" ptrtoint from "undef". ; CHECK: @VerifyDiagnosticConsumerTest ; CHECK: bb: -; CHECK: %0 = ptrtoint i8* undef to i64 -; CHECK-NOT: %0 +; "dead" ptrpoint not emitted (or dead code eliminated) with +; current LSR cost model. +; CHECK-NOT: = ptrtoint i8* undef to i64 ; CHECK: .lr.ph -; CHECK-NOT: %0 -; CHECK: sub i64 %7, %tmp6 -; CHECK-NOT: %0 +; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1 +; CHECK: sub i64 [[TMP]], %tmp6 ; CHECK: ret void define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 { bb: diff --git a/test/Transforms/LoopStrengthReduce/uglygep.ll b/test/Transforms/LoopStrengthReduce/uglygep.ll index 8af5cf1dfd..10c77d5f64 100644 --- a/test/Transforms/LoopStrengthReduce/uglygep.ll +++ b/test/Transforms/LoopStrengthReduce/uglygep.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-reduce -S | not grep uglygep +; RUN: opt < %s -loop-reduce -S | FileCheck %s ; LSR shouldn't consider %t8 to be an interesting user of %t6, and it ; should be able to form pretty GEPs. @@ -6,6 +6,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @Z4() nounwind { +; CHECK: define void @Z4 bb: br label %bb3 @@ -20,11 +21,26 @@ bb3: ; preds = %bb2, %bb %t4 = phi i64 [ %t, %bb2 ], [ 0, %bb ] ; <i64> [#uses=3] br label %bb1 +; CHECK: bb10: +; CHECK-NEXT: %t7 = icmp eq i64 %t4, 0 +; Host %t2 computation outside the loop. +; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8* undef, i64 %t4 +; CHECK-NEXT: br label %bb14 bb10: ; preds = %bb9 %t7 = icmp eq i64 %t4, 0 ; <i1> [#uses=1] %t3 = add i64 %t4, 16 ; <i64> [#uses=1] br label %bb14 +; CHECK: bb14: +; CHECK-NEXT: store i8 undef, i8* [[SCEVGEP]] +; CHECK-NEXT: %t6 = load float** undef +; Fold %t3's add within the address. +; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float* %t6, i64 4 +; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float* [[SCEVGEP1]] to i8* +; Use the induction variable (%t4) to access the right element +; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8* [[SCEVGEP2]], i64 %t4 +; CHECK-NEXT: store i8 undef, i8* [[ADDRESS]] +; CHECK-NEXT: br label %bb14 bb14: ; preds = %bb14, %bb10 %t2 = getelementptr inbounds i8* undef, i64 %t4 ; <i8*> [#uses=1] store i8 undef, i8* %t2 @@ -36,9 +52,15 @@ bb14: ; preds = %bb14, %bb10 } define fastcc void @TransformLine() nounwind { +; CHECK: @TransformLine bb: br label %loop0 +; CHECK: loop0: +; Induction variable is initialized to -2. +; CHECK-NEXT: [[PHIIV:%[^ ]+]] = phi i32 [ [[IVNEXT:%[^ ]+]], %loop0 ], [ -2, %bb ] +; CHECK-NEXT: [[IVNEXT]] = add i32 [[PHIIV]], 1 +; CHECK-NEXT: br i1 false, label %loop0, label %bb0 loop0: ; preds = %loop0, %bb %i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ] ; <i32> [#uses=2] %i0.next = add i32 %i0, 1 ; <i32> [#uses=1] @@ -47,18 +69,52 @@ loop0: ; preds = %loop0, %bb bb0: ; preds = %loop0 br label %loop1 +; CHECK: loop1: +; CHECK-NEXT: %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ] +; IVNEXT covers the uses of %i0 and %t0. +; Therefore, %t0 has been removed. +; The critical edge has been split. +; CHECK-NEXT: br i1 false, label %bb2, label %[[LOOP1BB6:.+]] loop1: ; preds = %bb5, %bb0 %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ] ; <i32> [#uses=4] %t0 = add i32 %i0, %i1 ; <i32> [#uses=1] br i1 false, label %bb2, label %bb6 +; CHECK: bb2: +; Critical edge split. +; CHECK-NEXT: br i1 true, label %[[BB2BB6:[^,]+]], label %bb5 bb2: ; preds = %loop1 br i1 true, label %bb6, label %bb5 +; CHECK: bb5: +; CHECK-NEXT: %i1.next = add i32 %i1, 1 +; CHECK-NEXT: br i1 true, label %[[BB5BB6:[^,]+]], label %loop1 bb5: ; preds = %bb2 %i1.next = add i32 %i1, 1 ; <i32> [#uses=1] br i1 true, label %bb6, label %loop1 +; bb5 to bb6 split basic block. +; CHECK: [[BB5BB6]]: +; CHECK-NEXT: [[INITIALVAL:%[^ ]+]] = add i32 [[IVNEXT]], %i1.next +; CHECK-NEXT: br label %[[SPLITTOBB6:.+]] + +; bb2 to bb6 split basic block. +; CHECK: [[BB2BB6]]: +; CHECK-NEXT: br label %[[SPLITTOBB6]] + +; Split basic blocks to bb6. +; CHECK: [[SPLITTOBB6]]: +; CHECK-NEXT: [[INITP8:%[^ ]+]] = phi i32 [ [[INITIALVAL]], %[[BB5BB6]] ], [ undef, %[[BB2BB6]] ] +; CHECK-NEXT: [[INITP9:%[^ ]+]] = phi i32 [ undef, %[[BB5BB6]] ], [ %i1, %[[BB2BB6]] ] +; CHECK-NEXT: br label %bb6 + +; CHECK: [[LOOP1BB6]]: +; CHECK-NEXT: br label %bb6 + +; CHECK: bb6: +; CHECK-NEXT: %p8 = phi i32 [ undef, %[[LOOP1BB6]] ], [ [[INITP8]], %[[SPLITTOBB6]] ] +; CHECK-NEXT: %p9 = phi i32 [ %i1, %[[LOOP1BB6]] ], [ [[INITP9]], %[[SPLITTOBB6]] ] +; CHECK-NEXT: unreachable bb6: ; preds = %bb5, %bb2, %loop1 %p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; <i32> [#uses=0] %p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; <i32> [#uses=0] |