summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp51
-rw-r--r--test/CodeGen/X86/masked-iv-safe.ll5
-rw-r--r--test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll10
-rw-r--r--test/Transforms/LoopStrengthReduce/uglygep.ll58
4 files changed, 109 insertions, 15 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index c7344f6bbd..ecc96ae0b2 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -774,6 +774,13 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
}
namespace {
+class LSRUse;
+}
+// Check if it is legal to fold 2 base registers.
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+ const Formula &F);
+
+namespace {
/// Cost - This class is used to measure and compare candidate formulae.
class Cost {
@@ -810,12 +817,14 @@ public:
return NumRegs == ~0u;
}
- void RateFormula(const Formula &F,
+ void RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
SmallPtrSet<const SCEV *, 16> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
void print(raw_ostream &OS) const;
@@ -900,12 +909,14 @@ void Cost::RatePrimaryRegister(const SCEV *Reg,
}
}
-void Cost::RateFormula(const Formula &F,
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+ const Formula &F,
SmallPtrSet<const SCEV *, 16> &Regs,
const DenseSet<const SCEV *> &VisitedRegs,
const Loop *L,
const SmallVectorImpl<int64_t> &Offsets,
ScalarEvolution &SE, DominatorTree &DT,
+ const LSRUse &LU,
SmallPtrSet<const SCEV *, 16> *LoserRegs) {
// Tally up the registers.
if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -932,7 +943,9 @@ void Cost::RateFormula(const Formula &F,
// Determine how many (unfolded) adds we'll need inside the loop.
size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
if (NumBaseParts > 1)
- NumBaseAdds += NumBaseParts - 1;
+ // Do not count the base and a possible second register if the target
+ // allows to fold 2 registers.
+ NumBaseAdds += NumBaseParts - (1 + isLegal2RegAMUse(TTI, LU, F));
// Tally up the non-zero immediates.
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -1359,6 +1372,30 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
F.BaseOffset, F.HasBaseReg, F.Scale);
}
+static bool isLegal2RegAMUse(const TargetTransformInfo &TTI, const LSRUse &LU,
+ const Formula &F) {
+ // If F is used as an Addressing Mode, it may fold one Base plus one
+ // scaled register. If the scaled register is nil, do as if another
+ // element of the base regs is a 1-scaled register.
+ // This is possible if BaseRegs has at least 2 registers.
+
+ // If this is not an address calculation, this is not an addressing mode
+ // use.
+ if (LU.Kind != LSRUse::Address)
+ return false;
+
+ // F is already scaled.
+ if (F.Scale != 0)
+ return false;
+
+ // We need to keep one register for the base and one to scale.
+ if (F.BaseRegs.size() < 2)
+ return false;
+
+ return isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, 1);
+ }
+
static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
LSRUse::KindType Kind, Type *AccessTy,
GlobalValue *BaseGV, int64_t BaseOffset,
@@ -3690,7 +3727,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
// the corresponding bad register from the Regs set.
Cost CostF;
Regs.clear();
- CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
+ CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
&LoserRegs);
if (CostF.isLoser()) {
// During initial formula generation, undesirable formulae are generated
@@ -3726,7 +3763,8 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
Cost CostBest;
Regs.clear();
- CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
+ CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
+ DT, LU);
if (CostF < CostBest)
std::swap(F, Best);
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
@@ -4079,7 +4117,8 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
- NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
+ NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
+ LU);
if (NewCost < SolutionCost) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index a7b036e9b6..c33cac2e05 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -3,9 +3,8 @@
; RUN: not grep movz %t
; RUN: not grep sar %t
; RUN: not grep shl %t
-; RUN: grep add %t | count 1
-; RUN: grep inc %t | count 4
-; RUN: grep dec %t | count 2
+; RUN: grep add %t | count 5
+; RUN: grep inc %t | count 2
; RUN: grep lea %t | count 3
; Optimize away zext-inreg and sext-inreg on the loop induction
diff --git a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
index 8fbddf8ae4..652eb06225 100644
--- a/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
+++ b/test/Transforms/LoopStrengthReduce/2013-01-14-ReuseCast.ll
@@ -10,12 +10,12 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
; Verify that nothing uses the "dead" ptrtoint from "undef".
; CHECK: @VerifyDiagnosticConsumerTest
; CHECK: bb:
-; CHECK: %0 = ptrtoint i8* undef to i64
-; CHECK-NOT: %0
+; "dead" ptrpoint not emitted (or dead code eliminated) with
+; current LSR cost model.
+; CHECK-NOT: = ptrtoint i8* undef to i64
; CHECK: .lr.ph
-; CHECK-NOT: %0
-; CHECK: sub i64 %7, %tmp6
-; CHECK-NOT: %0
+; CHECK: [[TMP:%[^ ]+]] = add i64 %tmp5, 1
+; CHECK: sub i64 [[TMP]], %tmp6
; CHECK: ret void
define void @VerifyDiagnosticConsumerTest() unnamed_addr nounwind uwtable align 2 {
bb:
diff --git a/test/Transforms/LoopStrengthReduce/uglygep.ll b/test/Transforms/LoopStrengthReduce/uglygep.ll
index 8af5cf1dfd..10c77d5f64 100644
--- a/test/Transforms/LoopStrengthReduce/uglygep.ll
+++ b/test/Transforms/LoopStrengthReduce/uglygep.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-reduce -S | not grep uglygep
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
; LSR shouldn't consider %t8 to be an interesting user of %t6, and it
; should be able to form pretty GEPs.
@@ -6,6 +6,7 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
define void @Z4() nounwind {
+; CHECK: define void @Z4
bb:
br label %bb3
@@ -20,11 +21,26 @@ bb3: ; preds = %bb2, %bb
%t4 = phi i64 [ %t, %bb2 ], [ 0, %bb ] ; <i64> [#uses=3]
br label %bb1
+; CHECK: bb10:
+; CHECK-NEXT: %t7 = icmp eq i64 %t4, 0
+; Host %t2 computation outside the loop.
+; CHECK-NEXT: [[SCEVGEP:%[^ ]+]] = getelementptr i8* undef, i64 %t4
+; CHECK-NEXT: br label %bb14
bb10: ; preds = %bb9
%t7 = icmp eq i64 %t4, 0 ; <i1> [#uses=1]
%t3 = add i64 %t4, 16 ; <i64> [#uses=1]
br label %bb14
+; CHECK: bb14:
+; CHECK-NEXT: store i8 undef, i8* [[SCEVGEP]]
+; CHECK-NEXT: %t6 = load float** undef
+; Fold %t3's add within the address.
+; CHECK-NEXT: [[SCEVGEP1:%[^ ]+]] = getelementptr float* %t6, i64 4
+; CHECK-NEXT: [[SCEVGEP2:%[^ ]+]] = bitcast float* [[SCEVGEP1]] to i8*
+; Use the induction variable (%t4) to access the right element
+; CHECK-NEXT: [[ADDRESS:%[^ ]+]] = getelementptr i8* [[SCEVGEP2]], i64 %t4
+; CHECK-NEXT: store i8 undef, i8* [[ADDRESS]]
+; CHECK-NEXT: br label %bb14
bb14: ; preds = %bb14, %bb10
%t2 = getelementptr inbounds i8* undef, i64 %t4 ; <i8*> [#uses=1]
store i8 undef, i8* %t2
@@ -36,9 +52,15 @@ bb14: ; preds = %bb14, %bb10
}
define fastcc void @TransformLine() nounwind {
+; CHECK: @TransformLine
bb:
br label %loop0
+; CHECK: loop0:
+; Induction variable is initialized to -2.
+; CHECK-NEXT: [[PHIIV:%[^ ]+]] = phi i32 [ [[IVNEXT:%[^ ]+]], %loop0 ], [ -2, %bb ]
+; CHECK-NEXT: [[IVNEXT]] = add i32 [[PHIIV]], 1
+; CHECK-NEXT: br i1 false, label %loop0, label %bb0
loop0: ; preds = %loop0, %bb
%i0 = phi i32 [ %i0.next, %loop0 ], [ 0, %bb ] ; <i32> [#uses=2]
%i0.next = add i32 %i0, 1 ; <i32> [#uses=1]
@@ -47,18 +69,52 @@ loop0: ; preds = %loop0, %bb
bb0: ; preds = %loop0
br label %loop1
+; CHECK: loop1:
+; CHECK-NEXT: %i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ]
+; IVNEXT covers the uses of %i0 and %t0.
+; Therefore, %t0 has been removed.
+; The critical edge has been split.
+; CHECK-NEXT: br i1 false, label %bb2, label %[[LOOP1BB6:.+]]
loop1: ; preds = %bb5, %bb0
%i1 = phi i32 [ 0, %bb0 ], [ %i1.next, %bb5 ] ; <i32> [#uses=4]
%t0 = add i32 %i0, %i1 ; <i32> [#uses=1]
br i1 false, label %bb2, label %bb6
+; CHECK: bb2:
+; Critical edge split.
+; CHECK-NEXT: br i1 true, label %[[BB2BB6:[^,]+]], label %bb5
bb2: ; preds = %loop1
br i1 true, label %bb6, label %bb5
+; CHECK: bb5:
+; CHECK-NEXT: %i1.next = add i32 %i1, 1
+; CHECK-NEXT: br i1 true, label %[[BB5BB6:[^,]+]], label %loop1
bb5: ; preds = %bb2
%i1.next = add i32 %i1, 1 ; <i32> [#uses=1]
br i1 true, label %bb6, label %loop1
+; bb5 to bb6 split basic block.
+; CHECK: [[BB5BB6]]:
+; CHECK-NEXT: [[INITIALVAL:%[^ ]+]] = add i32 [[IVNEXT]], %i1.next
+; CHECK-NEXT: br label %[[SPLITTOBB6:.+]]
+
+; bb2 to bb6 split basic block.
+; CHECK: [[BB2BB6]]:
+; CHECK-NEXT: br label %[[SPLITTOBB6]]
+
+; Split basic blocks to bb6.
+; CHECK: [[SPLITTOBB6]]:
+; CHECK-NEXT: [[INITP8:%[^ ]+]] = phi i32 [ [[INITIALVAL]], %[[BB5BB6]] ], [ undef, %[[BB2BB6]] ]
+; CHECK-NEXT: [[INITP9:%[^ ]+]] = phi i32 [ undef, %[[BB5BB6]] ], [ %i1, %[[BB2BB6]] ]
+; CHECK-NEXT: br label %bb6
+
+; CHECK: [[LOOP1BB6]]:
+; CHECK-NEXT: br label %bb6
+
+; CHECK: bb6:
+; CHECK-NEXT: %p8 = phi i32 [ undef, %[[LOOP1BB6]] ], [ [[INITP8]], %[[SPLITTOBB6]] ]
+; CHECK-NEXT: %p9 = phi i32 [ %i1, %[[LOOP1BB6]] ], [ [[INITP9]], %[[SPLITTOBB6]] ]
+; CHECK-NEXT: unreachable
bb6: ; preds = %bb5, %bb2, %loop1
%p8 = phi i32 [ %t0, %bb5 ], [ undef, %loop1 ], [ undef, %bb2 ] ; <i32> [#uses=0]
%p9 = phi i32 [ undef, %bb5 ], [ %i1, %loop1 ], [ %i1, %bb2 ] ; <i32> [#uses=0]