summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDan Gohman <gohman@apple.com>2011-05-03 00:46:49 +0000
committerDan Gohman <gohman@apple.com>2011-05-03 00:46:49 +0000
commitcca82149adef8306a295abdc963213ae3b11bbb6 (patch)
tree2d38fc521b67e22982d8a8535c98bafe4cdbd709
parentf7710af4ba78aa7a0cc9c226f334d8f2b6ab31bf (diff)
downloadllvm-cca82149adef8306a295abdc963213ae3b11bbb6.tar.gz
llvm-cca82149adef8306a295abdc963213ae3b11bbb6.tar.bz2
llvm-cca82149adef8306a295abdc963213ae3b11bbb6.tar.xz
Add an unfolded offset field to LSR's Formula record. This is used to
model constants which can be added to base registers via add-immediate instructions which don't require an additional register to materialize the immediate. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130743 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Target/TargetLowering.h8
-rw-r--r--lib/Target/ARM/ARMISelLowering.cpp8
-rw-r--r--lib/Target/ARM/ARMISelLowering.h6
-rw-r--r--lib/Transforms/Scalar/LoopStrengthReduce.cpp71
-rw-r--r--test/CodeGen/ARM/lsr-unfolded-offset.ll80
5 files changed, 164 insertions, 9 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 17d761ce8f..15db9c52bf 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1583,6 +1583,14 @@ public:
return true;
}
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can add
+ /// a register with the immediate without having to materialize the
+ /// immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t Imm) const {
+ return true;
+ }
+
//===--------------------------------------------------------------------===//
// Div utility functions
//
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0a31b87c4b..8533b6d38c 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -6984,6 +6984,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
return Imm >= 0 && Imm <= 255;
}
+/// isLegalAddImmediate - Return true if the specified immediate is legal
+/// add immediate, that is the target has add instructions which can add
+/// a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+ return ARM_AM::getSOImmVal(Imm) != -1;
+}
+
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
bool isSEXTLoad, SDValue &Base,
SDValue &Offset, bool &isInc,
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index a2e626062a..82d167ed33 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -264,6 +264,12 @@ namespace llvm {
/// the immediate into a register.
virtual bool isLegalICmpImmediate(int64_t Imm) const;
+ /// isLegalAddImmediate - Return true if the specified immediate is legal
+ /// add immediate, that is the target has add instructions which can
+ /// add a register and the immediate without having to materialize
+ /// the immediate into a register.
+ virtual bool isLegalAddImmediate(int64_t Imm) const;
+
/// getPreIndexedAddressParts - returns true by value, base pointer and
/// offset pointer and addressing mode by reference if the node's address
/// can be legally represented as pre-indexed load / store address.
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 5abc790423..0d617bf8b1 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -209,7 +209,12 @@ struct Formula {
/// when AM.Scale is not zero.
const SCEV *ScaledReg;
- Formula() : ScaledReg(0) {}
+ /// UnfoldedOffset - An additional constant offset which added near the
+ /// use. This requires a temporary register, but the offset itself can
+ /// live in an add immediate field rather than a register.
+ int64_t UnfoldedOffset;
+
+ Formula() : ScaledReg(0), UnfoldedOffset(0) {}
void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
@@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const {
OS << "<unknown>";
OS << ')';
}
+ if (UnfoldedOffset != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << "imm(" << UnfoldedOffset << ')';
+ }
}
void Formula::dump() const {
@@ -771,8 +780,10 @@ void Cost::RateFormula(const Formula &F,
RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
}
- if (F.BaseRegs.size() > 1)
- NumBaseAdds += F.BaseRegs.size() - 1;
+ // Determine how many (unfolded) adds we'll need inside the loop.
+ size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
+ if (NumBaseParts > 1)
+ NumBaseAdds += NumBaseParts - 1;
// Tally up the non-zero immediates.
for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -1945,7 +1956,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
if (F.BaseRegs == OrigF.BaseRegs &&
F.ScaledReg == OrigF.ScaledReg &&
F.AM.BaseGV == OrigF.AM.BaseGV &&
- F.AM.Scale == OrigF.AM.Scale) {
+ F.AM.Scale == OrigF.AM.Scale &&
+ F.UnfoldedOffset == OrigF.UnfoldedOffset) {
if (F.AM.BaseOffs == 0)
return &LU;
// This is the formula where all the registers and symbols matched;
@@ -2313,8 +2325,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
if (InnerSum->isZero())
continue;
Formula F = Base;
- F.BaseRegs[i] = InnerSum;
- F.BaseRegs.push_back(*J);
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (TLI && InnerSumSC &&
+ SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue();
+ F.BaseRegs.erase(F.BaseRegs.begin() + i);
+ } else
+ F.BaseRegs[i] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
@@ -2482,6 +2515,13 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
continue;
}
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ }
+
// If we make it here and it's legal, add it.
(void)InsertFormula(LU, LUIdx, F);
next:;
@@ -2664,7 +2704,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
// other orig regs.
ImmMapTy::const_iterator OtherImms[] = {
Imms.begin(), prior(Imms.end()),
- Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+ Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
};
for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
ImmMapTy::const_iterator M = OtherImms[i];
@@ -2738,8 +2778,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
Formula NewF = F;
NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, TLI))
- continue;
+ LU.Kind, LU.AccessTy, TLI)) {
+ if (!TLI ||
+ !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ continue;
+ NewF = F;
+ NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ }
NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
// If the new formula has a constant in a register, and adding the
@@ -3488,6 +3533,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
}
}
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
// Emit instructions summing all the operands.
const SCEV *FullS = Ops.empty() ?
SE.getConstant(IntTy, 0) :
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
new file mode 100644
index 0000000000..a3342c6bde
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s | FileCheck %s
+
+; LSR shouldn't introduce more induction variables than needed, increasing
+; register pressure and therefore spilling. There is more room for improvement
+; here.
+
+; CHECK: sub sp, #32
+
+; CHECK: ldr r{{.*}}, [sp, #4]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #16]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #12]
+; CHECK-NEXT: adds
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.7.0"
+
+%struct.partition_entry = type { i32, i32, i64, i64 }
+
+define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp {
+entry:
+ %cmp79 = icmp sgt i32 %num_entries, 0
+ br i1 %cmp79, label %outer.loop, label %for.end72
+
+outer.loop: ; preds = %for.inc69, %entry
+ %overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
+ %0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
+ %offset = getelementptr %struct.partition_entry* %part, i32 %0, i32 2
+ %len = getelementptr %struct.partition_entry* %part, i32 %0, i32 3
+ %tmp5 = load i64* %offset, align 4, !tbaa !0
+ %tmp15 = load i64* %len, align 4, !tbaa !0
+ %add = add nsw i64 %tmp15, %tmp5
+ br label %inner.loop
+
+inner.loop: ; preds = %for.inc, %outer.loop
+ %overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
+ %1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
+ %cmp23 = icmp eq i32 %0, %1
+ br i1 %cmp23, label %for.inc, label %if.end
+
+if.end: ; preds = %inner.loop
+ %len39 = getelementptr %struct.partition_entry* %part, i32 %1, i32 3
+ %offset28 = getelementptr %struct.partition_entry* %part, i32 %1, i32 2
+ %tmp29 = load i64* %offset28, align 4, !tbaa !0
+ %tmp40 = load i64* %len39, align 4, !tbaa !0
+ %add41 = add nsw i64 %tmp40, %tmp29
+ %cmp44 = icmp sge i64 %tmp29, %tmp5
+ %cmp47 = icmp slt i64 %tmp29, %add
+ %or.cond = and i1 %cmp44, %cmp47
+ %overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
+ %cmp52 = icmp sle i64 %add41, %add
+ %cmp56 = icmp sgt i64 %add41, %tmp5
+ %or.cond74 = and i1 %cmp52, %cmp56
+ %overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
+ %cmp61 = icmp sgt i64 %tmp29, %tmp5
+ %cmp65 = icmp slt i64 %add41, %add
+ %or.cond75 = or i1 %cmp61, %cmp65
+ br i1 %or.cond75, label %for.inc, label %if.then66
+
+if.then66: ; preds = %if.end
+ br label %for.inc
+
+for.inc: ; preds = %if.end, %if.then66, %inner.loop
+ %overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
+ %inc = add nsw i32 %1, 1
+ %exitcond = icmp eq i32 %inc, %num_entries
+ br i1 %exitcond, label %for.inc69, label %inner.loop
+
+for.inc69: ; preds = %for.inc
+ %inc71 = add nsw i32 %0, 1
+ %exitcond83 = icmp eq i32 %inc71, %num_entries
+ br i1 %exitcond83, label %for.end72, label %outer.loop
+
+for.end72: ; preds = %for.inc69, %entry
+ %overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
+ ret i32 %overlap.0.lcssa
+}
+
+!0 = metadata !{metadata !"long long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}