author    Evan Cheng <evan.cheng@apple.com>  2009-05-11 22:33:01 +0000
committer Evan Cheng <evan.cheng@apple.com>  2009-05-11 22:33:01 +0000
commit    5792f51e12d9c8685399e9857799365854ab5bf6
tree      70eb5ebee02553ba2b892ce1ff7f7bbaad77a682
parent    6dc4ade59505fd8a01370ff8da5b18110f7a2f41
Teach LSR to optimize more loop exit compares, i.e. change them to use the postinc iv value. Previously LSR would only optimize compares in the loop latch block. However, if LSR can prove it is safe (and profitable), it is now possible to change compares outside the latch block to use postinc values as well.
Also, if the compare is the only use of the iv, LSR now places the iv increment instruction immediately before the compare instead of in the latch.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@71485 91177308-0d34-0410-b5e6-96231b3b80d8
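
To illustrate the first change (a minimal hand-written IR sketch, assuming a simple count-up loop; %n.adj is a hypothetical bound adjusted by one stride, not a name from this patch):

    ; Before: the exit test reads the pre-incremented iv, so both %i and
    ; %i.next are live across the backedge.
    loop:
      %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
      ; ... loop body ...
      %i.next = add i64 %i, 1
      %exitcond = icmp eq i64 %i, %n
      br i1 %exitcond, label %exit, label %loop

    ; After: the exit test reads the post-incremented iv against the
    ; adjusted bound %n.adj (= %n + 1, hypothetical), so only %i.next
    ; stays live and the iv's live ranges can coalesce into one register.
    loop:
      %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
      ; ... loop body ...
      %i.next = add i64 %i, 1
      %exitcond = icmp eq i64 %i.next, %n.adj
      br i1 %exitcond, label %exit, label %loop

Both forms exit after the same iteration: %i == %n exactly when %i + 1 == %n + 1.
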
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  182
-rw-r--r--  test/CodeGen/X86/lsr-loop-exit-cond.ll  134
-rw-r--r--  test/CodeGen/X86/lsr-negative-stride.ll  2
-rw-r--r--  test/CodeGen/X86/remat-mov-1.ll (renamed from test/CodeGen/X86/remat-mov0.ll)  2
4 files changed, 275 insertions, 45 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9568449948..127ef56cbd 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -43,6 +43,7 @@ STATISTIC(NumVariable, "Number of PHIs with variable strides");
STATISTIC(NumEliminated, "Number of strides eliminated");
STATISTIC(NumShadow, "Number of Shadow IVs optimized");
STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
+STATISTIC(NumLoopCond, "Number of loop terminating conds optimized");
static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
cl::init(false),
@@ -122,6 +123,10 @@ namespace {
/// particular stride.
std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
+ /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
+ /// reused (nor should they be rewritten to reuse other strides).
+ SmallSet<SCEVHandle, 4> StrideNoReuse;
+
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
/// We use this to iterate over the IVUsesByStride collection without being
/// dependent on random ordering of pointers in the process.
@@ -184,8 +189,8 @@ namespace {
SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
IVExpr&, const Type*,
const std::vector<BasedUser>& UsersToProcess);
- bool ValidStride(bool, int64_t,
- const std::vector<BasedUser>& UsersToProcess);
+ bool ValidScale(bool, int64_t,
+ const std::vector<BasedUser>& UsersToProcess);
SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
@@ -213,6 +218,7 @@ namespace {
SCEVHandle Stride,
SCEVHandle CommonExprs,
Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter);
void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
@@ -799,7 +805,7 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy,
/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
/// loop varying to the Imm operand.
static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
- Loop *L, ScalarEvolution *SE) {
+ Loop *L, ScalarEvolution *SE) {
if (Val->isLoopInvariant(L)) return; // Nothing to do.
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
@@ -1122,16 +1128,15 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
return Result;
}
-/// ValidStride - Check whether the given Scale is valid for all loads and
+/// ValidScale - Check whether the given Scale is valid for all loads and
/// stores in UsersToProcess.
///
-bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
- int64_t Scale,
+bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
const std::vector<BasedUser>& UsersToProcess) {
if (!TLI)
return true;
- for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
+ for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) {
// If this is a load or other access, pass the type of the access in.
const Type *AccessTy = Type::VoidTy;
if (isAddressUse(UsersToProcess[i].Inst,
@@ -1186,13 +1191,17 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
const SCEVHandle &Stride,
IVExpr &IV, const Type *Ty,
const std::vector<BasedUser>& UsersToProcess) {
+ if (StrideNoReuse.count(Stride))
+ return SE->getIntegerSCEV(0, Stride->getType());
+
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
IVsByStride.find(StrideOrder[NewStride]);
- if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
+ if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
+ StrideNoReuse.count(SI->first))
continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
if (SI->first != Stride &&
@@ -1206,7 +1215,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// multiplications.
if (Scale == 1 ||
(AllUsesAreAddresses &&
- ValidStride(HasBaseReg, Scale, UsersToProcess)))
+ ValidScale(HasBaseReg, Scale, UsersToProcess)))
for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
IE = SI->second.IVs.end(); II != IE; ++II)
// FIXME: Only handle base == 0 for now.
@@ -1302,7 +1311,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
// field of the use, so that we don't try to use something before it is
// computed.
MoveLoopVariantsToImmediateField(UsersToProcess.back().Base,
- UsersToProcess.back().Imm, L, SE);
+ UsersToProcess.back().Imm, L, SE);
assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
"Base value is not loop invariant!");
}
@@ -1452,6 +1461,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
/// Return the created phi node.
///
static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &Rewriter) {
assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
@@ -1475,16 +1485,17 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
IncAmount = Rewriter.SE.getNegativeSCEV(Step);
// Insert an add instruction right before the terminator corresponding
- // to the back-edge.
+ // to the back-edge or just before the only use. The location is determined
+ // by the caller and passed in as IVIncInsertPt.
Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty,
Preheader->getTerminator());
Instruction *IncV;
if (isNegative) {
IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
- LatchBlock->getTerminator());
+ IVIncInsertPt);
} else {
IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
- LatchBlock->getTerminator());
+ IVIncInsertPt);
}
if (!isa<ConstantInt>(StepV)) ++NumVariable;
@@ -1541,6 +1552,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
// Rewrite the UsersToProcess records, creating a separate PHI for each
// unique Base value.
+ Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator();
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
// TODO: The uses are grouped by base, but not sorted. We arbitrarily
// pick the first Imm value here to start with, and adjust it for the
@@ -1548,7 +1560,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
SCEVHandle Imm = UsersToProcess[i].Imm;
SCEVHandle Base = UsersToProcess[i].Base;
SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
- PHINode *Phi = InsertAffinePhi(Start, Stride, L,
+ PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Loop over all the users with the same base.
do {
@@ -1561,6 +1573,18 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
}
}
+/// FindIVIncInsertPt - Return the location to insert the increment instruction.
+/// If the only use is a use of the postinc value (it must be the loop
+/// termination condition), insert it just before that use.
+static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
+ const Loop *L) {
+ if (UsersToProcess.size() == 1 &&
+ UsersToProcess[0].isUseOfPostIncrementedValue &&
+ L->contains(UsersToProcess[0].Inst->getParent()))
+ return UsersToProcess[0].Inst;
+ return L->getLoopLatch()->getTerminator();
+}
+
/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
/// given users to share.
///
@@ -1570,12 +1594,13 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
SCEVHandle Stride,
SCEVHandle CommonExprs,
Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter) {
DOUT << " Inserting new PHI:\n";
PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
- Stride, L,
+ Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Remember this in case a later stride is multiple of this.
@@ -1590,8 +1615,8 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
DOUT << "\n";
}
-/// PrepareToStrengthReduceWithNewPhi - Prepare for the given users to reuse
-/// an induction variable with a stride that is a factor of the current
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse an induction variable with a stride that is a factor of the current
/// induction variable.
///
void
@@ -1727,6 +1752,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
BasicBlock *Preheader = L->getLoopPreheader();
Instruction *PreInsertPt = Preheader->getTerminator();
BasicBlock *LatchBlock = L->getLoopLatch();
+ Instruction *IVIncInsertPt = LatchBlock->getTerminator();
Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
@@ -1755,13 +1781,15 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
AllUsesAreOutsideLoop,
Stride, ReuseIV, ReplacedTy,
UsersToProcess);
- if (isa<SCEVConstant>(RewriteFactor) &&
- cast<SCEVConstant>(RewriteFactor)->isZero())
- PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
- CommonBaseV, L, PreheaderRewriter);
- else
+ if (!RewriteFactor->isZero())
PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
ReuseIV, PreInsertPt);
+ else {
+ IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L);
+ PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+ CommonBaseV, IVIncInsertPt,
+ L, PreheaderRewriter);
+ }
}
// Process all the users now, replacing their strided uses with
@@ -1800,7 +1828,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// FIXME: Use emitted users to emit other users.
BasedUser &User = UsersToProcess.back();
- DOUT << " Examining use ";
+ DOUT << " Examining ";
+ if (User.isUseOfPostIncrementedValue)
+ DOUT << "postinc";
+ else
+ DOUT << "preinc";
+ DOUT << " use ";
DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace,
/*PrintType=*/false));
DOUT << " in Inst: " << *(User.Inst);
@@ -1810,11 +1843,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
Value *RewriteOp = User.Phi;
if (User.isUseOfPostIncrementedValue) {
RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock);
-
// If this user is in the loop, make sure it is the last thing in the
- // loop to ensure it is dominated by the increment.
- if (L->contains(User.Inst->getParent()))
- User.Inst->moveBefore(LatchBlock->getTerminator());
+ // loop to ensure it is dominated by the increment. In case it's the
+ // only use of the iv, the increment instruction is already before the
+ // use.
+ if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt)
+ User.Inst->moveBefore(IVIncInsertPt);
}
SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
@@ -2085,7 +2119,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// if it's likely the new stride uses will be rewritten using the
// stride of the compare instruction.
if (AllUsesAreAddresses &&
- ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess))
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
continue;
// If scale is negative, use swapped predicate unless it's testing
@@ -2304,8 +2338,8 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
if (!DestTy) continue;
if (TLI) {
- /* If target does not support DestTy natively then do not apply
- this transformation. */
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
MVT DVT = TLI->getValueType(DestTy);
if (!TLI->isTypeLegal(DVT)) continue;
}
@@ -2380,8 +2414,6 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
// TODO: implement optzns here.
OptimizeShadowIV(L);
-
- OptimizeLoopTermCond(L);
}
/// OptimizeLoopTermCond - Change loop terminating condition to use the
@@ -2391,23 +2423,78 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// can, we want to change it to use a post-incremented version of its
// induction variable, to allow coalescing the live ranges for the IV into
// one register value.
- PHINode *SomePHI = cast<PHINode>(L->getHeader()->begin());
- BasicBlock *Preheader = L->getLoopPreheader();
- BasicBlock *LatchBlock =
- SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader);
- BranchInst *TermBr = dyn_cast<BranchInst>(LatchBlock->getTerminator());
- if (!TermBr || TermBr->isUnconditional() ||
- !isa<ICmpInst>(TermBr->getCondition()))
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BasicBlock *ExitBlock = L->getExitingBlock();
+ if (!ExitBlock)
+ // Multiple exits, just look at the exit in the latch block if there is one.
+ ExitBlock = LatchBlock;
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitBlock->getTerminator());
+ if (!TermBr)
+ return;
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
return;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = 0;
const SCEVHandle *CondStride = 0;
-
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse, CondStride))
return; // setcc doesn't use the IV.
+ if (ExitBlock != LatchBlock) {
+ if (!Cond->hasOneUse())
+ // See below, we don't want the condition to be cloned.
+ return;
+
+ // If exiting block is the latch block, we know it's safe and profitable to
+ // transform the icmp to use post-inc iv. Otherwise do so only if it would
+ // not reuse another iv and its iv would be reused by other uses. We are
+ // optimizing for the case where the icmp is the only use of the iv.
+ IVUsersOfOneStride &StrideUses = IVUsesByStride[*CondStride];
+ for (unsigned i = 0, e = StrideUses.Users.size(); i != e; ++i) {
+ if (StrideUses.Users[i].User == Cond)
+ continue;
+ if (!StrideUses.Users[i].isUseOfPostIncrementedValue)
+ return;
+ }
+
+ // FIXME: This is expensive, and worse still ChangeCompareStride does a
+ // similar check. Can we perform all the icmp related transformations after
+ // StrengthReduceStridedIVUsers?
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
+ int64_t SInt = SC->getValue()->getSExtValue();
+ for (unsigned NewStride = 0, ee = StrideOrder.size(); NewStride != ee;
+ ++NewStride) {
+ std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
+ IVUsesByStride.find(StrideOrder[NewStride]);
+ if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
+ continue;
+ int64_t SSInt =
+ cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SSInt == SInt)
+ return; // This can definitely be reused.
+ if (unsigned(abs(SSInt)) < SInt || (SSInt % SInt) != 0)
+ continue;
+ int64_t Scale = SSInt / SInt;
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ // Avoid rewriting the compare instruction with an iv of new stride
+ // if it's likely the new stride uses will be rewritten using the
+ // stride of the compare instruction.
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+ return;
+ }
+ }
+
+ StrideNoReuse.insert(*CondStride);
+ }
+
// If the trip count is computed in terms of an smax (due to ScalarEvolution
// being unable to find a sufficient guard, for example), change the loop
// comparison to use SLT instead of NE.
@@ -2415,7 +2502,8 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// If possible, change stride and operands of the compare instruction to
// eliminate one stride.
- Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
+ if (ExitBlock == LatchBlock)
+ Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
@@ -2431,7 +2519,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// Clone the IVUse, as the old use still exists!
IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond,
- CondUse->OperandValToReplace);
+ CondUse->OperandValToReplace);
CondUse = &IVUsesByStride[*CondStride].Users.back();
}
}
@@ -2442,6 +2530,8 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride);
CondUse->isUseOfPostIncrementedValue = true;
Changed = true;
+
+ ++NumLoopCond;
}
// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
@@ -2582,6 +2672,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// computation of some other indvar to decide when to terminate the loop.
OptimizeIndvars(L);
+ // Change loop terminating condition to use the postinc iv when possible
+ // and optimize loop terminating compare. FIXME: Move this after
+ // StrengthReduceStridedIVUsers?
+ OptimizeLoopTermCond(L);
+
// FIXME: We can shrink overlarge IV's here. e.g. if the code has
// computation in i64 values and the target doesn't support i64, demote
// the computation to 32-bit if safe.
@@ -2616,6 +2711,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
IVUsesByStride.clear();
IVsByStride.clear();
StrideOrder.clear();
+ StrideNoReuse.clear();
// Clean up after ourselves
if (!DeadInsts.empty())
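
The new test that follows exercises the case this patch enables: the loop's exit compare sits in the header bb rather than the latch bb1, and it is the only (postinc) use of its iv, so FindIVIncInsertPt places the iv update directly before the compare. A hedged sketch of the expected post-LSR shape (block labels borrowed from the test; %count and the countdown form are assumptions, not the pass's verbatim output):

    bb:
      %lsr.iv = phi i64 [ %count, %entry ], [ %lsr.iv.next, %bb1 ]
      ; ... loop body ...
      %lsr.iv.next = sub i64 %lsr.iv, 1        ; iv update emitted here, not at the latch terminator
      %exitcond = icmp eq i64 %lsr.iv.next, 0  ; postinc exit test
      br i1 %exitcond, label %bb2, label %bb1

The RUN line below then uses %prcontext to require that decq is immediately followed by jne in the generated x86-64 assembly, i.e. that nothing is scheduled between the iv update and the exit branch.
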
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
new file mode 100644
index 0000000000..c998268600
--- /dev/null
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -0,0 +1,134 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | %prcontext decq 1 | grep jne
+
+@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
+@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
+@Te3 = external global [256 x i32] ; <[256 x i32]*> [#uses=2]
+
+define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
+entry:
+ %0 = load i32* %rk, align 4 ; <i32> [#uses=1]
+ %1 = getelementptr i32* %rk, i64 1 ; <i32*> [#uses=1]
+ %2 = load i32* %1, align 4 ; <i32> [#uses=1]
+ %tmp15 = add i32 %r, -1 ; <i32> [#uses=1]
+ %tmp.16 = zext i32 %tmp15 to i64 ; <i64> [#uses=2]
+ br label %bb
+
+bb: ; preds = %bb1, %entry
+ %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ] ; <i64> [#uses=3]
+ %s1.0 = phi i32 [ %2, %entry ], [ %56, %bb1 ] ; <i32> [#uses=2]
+ %s0.0 = phi i32 [ %0, %entry ], [ %43, %bb1 ] ; <i32> [#uses=2]
+ %tmp18 = shl i64 %indvar, 4 ; <i64> [#uses=4]
+ %rk26 = bitcast i32* %rk to i8* ; <i8*> [#uses=6]
+ %3 = lshr i32 %s0.0, 24 ; <i32> [#uses=1]
+ %4 = zext i32 %3 to i64 ; <i64> [#uses=1]
+ %5 = getelementptr [256 x i32]* @Te0, i64 0, i64 %4 ; <i32*> [#uses=1]
+ %6 = load i32* %5, align 4 ; <i32> [#uses=1]
+ %7 = lshr i32 %s1.0, 16 ; <i32> [#uses=1]
+ %8 = and i32 %7, 255 ; <i32> [#uses=1]
+ %9 = zext i32 %8 to i64 ; <i64> [#uses=1]
+ %10 = getelementptr [256 x i32]* @Te1, i64 0, i64 %9 ; <i32*> [#uses=1]
+ %11 = load i32* %10, align 4 ; <i32> [#uses=1]
+ %ctg2.sum2728 = or i64 %tmp18, 8 ; <i64> [#uses=1]
+ %12 = getelementptr i8* %rk26, i64 %ctg2.sum2728 ; <i8*> [#uses=1]
+ %13 = bitcast i8* %12 to i32* ; <i32*> [#uses=1]
+ %14 = load i32* %13, align 4 ; <i32> [#uses=1]
+ %15 = xor i32 %11, %6 ; <i32> [#uses=1]
+ %16 = xor i32 %15, %14 ; <i32> [#uses=3]
+ %17 = lshr i32 %s1.0, 24 ; <i32> [#uses=1]
+ %18 = zext i32 %17 to i64 ; <i64> [#uses=1]
+ %19 = getelementptr [256 x i32]* @Te0, i64 0, i64 %18 ; <i32*> [#uses=1]
+ %20 = load i32* %19, align 4 ; <i32> [#uses=1]
+ %21 = and i32 %s0.0, 255 ; <i32> [#uses=1]
+ %22 = zext i32 %21 to i64 ; <i64> [#uses=1]
+ %23 = getelementptr [256 x i32]* @Te3, i64 0, i64 %22 ; <i32*> [#uses=1]
+ %24 = load i32* %23, align 4 ; <i32> [#uses=1]
+ %ctg2.sum2930 = or i64 %tmp18, 12 ; <i64> [#uses=1]
+ %25 = getelementptr i8* %rk26, i64 %ctg2.sum2930 ; <i8*> [#uses=1]
+ %26 = bitcast i8* %25 to i32* ; <i32*> [#uses=1]
+ %27 = load i32* %26, align 4 ; <i32> [#uses=1]
+ %28 = xor i32 %24, %20 ; <i32> [#uses=1]
+ %29 = xor i32 %28, %27 ; <i32> [#uses=4]
+ %30 = lshr i32 %16, 24 ; <i32> [#uses=1]
+ %31 = zext i32 %30 to i64 ; <i64> [#uses=1]
+ %32 = getelementptr [256 x i32]* @Te0, i64 0, i64 %31 ; <i32*> [#uses=1]
+ %33 = load i32* %32, align 4 ; <i32> [#uses=2]
+ %exitcond = icmp eq i64 %indvar, %tmp.16 ; <i1> [#uses=1]
+ br i1 %exitcond, label %bb2, label %bb1
+
+bb1: ; preds = %bb
+ %ctg2.sum31 = add i64 %tmp18, 16 ; <i64> [#uses=1]
+ %34 = getelementptr i8* %rk26, i64 %ctg2.sum31 ; <i8*> [#uses=1]
+ %35 = bitcast i8* %34 to i32* ; <i32*> [#uses=1]
+ %36 = lshr i32 %29, 16 ; <i32> [#uses=1]
+ %37 = and i32 %36, 255 ; <i32> [#uses=1]
+ %38 = zext i32 %37 to i64 ; <i64> [#uses=1]
+ %39 = getelementptr [256 x i32]* @Te1, i64 0, i64 %38 ; <i32*> [#uses=1]
+ %40 = load i32* %39, align 4 ; <i32> [#uses=1]
+ %41 = load i32* %35, align 4 ; <i32> [#uses=1]
+ %42 = xor i32 %40, %33 ; <i32> [#uses=1]
+ %43 = xor i32 %42, %41 ; <i32> [#uses=1]
+ %44 = lshr i32 %29, 24 ; <i32> [#uses=1]
+ %45 = zext i32 %44 to i64 ; <i64> [#uses=1]
+ %46 = getelementptr [256 x i32]* @Te0, i64 0, i64 %45 ; <i32*> [#uses=1]
+ %47 = load i32* %46, align 4 ; <i32> [#uses=1]
+ %48 = and i32 %16, 255 ; <i32> [#uses=1]
+ %49 = zext i32 %48 to i64 ; <i64> [#uses=1]
+ %50 = getelementptr [256 x i32]* @Te3, i64 0, i64 %49 ; <i32*> [#uses=1]
+ %51 = load i32* %50, align 4 ; <i32> [#uses=1]
+ %ctg2.sum32 = add i64 %tmp18, 20 ; <i64> [#uses=1]
+ %52 = getelementptr i8* %rk26, i64 %ctg2.sum32 ; <i8*> [#uses=1]
+ %53 = bitcast i8* %52 to i32* ; <i32*> [#uses=1]
+ %54 = load i32* %53, align 4 ; <i32> [#uses=1]
+ %55 = xor i32 %51, %47 ; <i32> [#uses=1]
+ %56 = xor i32 %55, %54 ; <i32> [#uses=1]
+ %indvar.next = add i64 %indvar, 1 ; <i64> [#uses=1]
+ br label %bb
+
+bb2: ; preds = %bb
+ %tmp10 = shl i64 %tmp.16, 4 ; <i64> [#uses=2]
+ %ctg2.sum = add i64 %tmp10, 16 ; <i64> [#uses=1]
+ %tmp1213 = getelementptr i8* %rk26, i64 %ctg2.sum ; <i8*> [#uses=1]
+ %57 = bitcast i8* %tmp1213 to i32* ; <i32*> [#uses=1]
+ %58 = and i32 %33, -16777216 ; <i32> [#uses=1]
+ %59 = lshr i32 %29, 16 ; <i32> [#uses=1]
+ %60 = and i32 %59, 255 ; <i32> [#uses=1]
+ %61 = zext i32 %60 to i64 ; <i64> [#uses=1]
+ %62 = getelementptr [256 x i32]* @Te1, i64 0, i64 %61 ; <i32*> [#uses=1]
+ %63 = load i32* %62, align 4 ; <i32> [#uses=1]
+ %64 = and i32 %63, 16711680 ; <i32> [#uses=1]
+ %65 = or i32 %64, %58 ; <i32> [#uses=1]
+ %66 = load i32* %57, align 4 ; <i32> [#uses=1]
+ %67 = xor i32 %65, %66 ; <i32> [#uses=2]
+ %68 = lshr i32 %29, 8 ; <i32> [#uses=1]
+ %69 = zext i32 %68 to i64 ; <i64> [#uses=1]
+ %70 = getelementptr [256 x i32]* @Te0, i64 0, i64 %69 ; <i32*> [#uses=1]
+ %71 = load i32* %70, align 4 ; <i32> [#uses=1]
+ %72 = and i32 %71, -16777216 ; <i32> [#uses=1]
+ %73 = and i32 %16, 255 ; <i32> [#uses=1]
+ %74 = zext i32 %73 to i64 ; <i64> [#uses=1]
+ %75 = getelementptr [256 x i32]* @Te1, i64 0, i64 %74 ; <i32*> [#uses=1]
+ %76 = load i32* %75, align 4 ; <i32> [#uses=1]
+ %77 = and i32 %76, 16711680 ; <i32> [#uses=1]
+ %78 = or i32 %77, %72 ; <i32> [#uses=1]
+ %ctg2.sum25 = add i64 %tmp10, 20 ; <i64> [#uses=1]
+ %79 = getelementptr i8* %rk26, i64 %ctg2.sum25 ; <i8*> [#uses=1]
+ %80 = bitcast i8* %79 to i32* ; <i32*> [#uses=1]
+ %81 = load i32* %80, align 4 ; <i32> [#uses=1]
+ %82 = xor i32 %78, %81 ; <i32> [#uses=2]
+ %83 = lshr i32 %67, 24 ; <i32> [#uses=1]
+ %84 = trunc i32 %83 to i8 ; <i8> [#uses=1]
+ store i8 %84, i8* %out, align 1
+ %85 = lshr i32 %67, 16 ; <i32> [#uses=1]
+ %86 = trunc i32 %85 to i8 ; <i8> [#uses=1]
+ %87 = getelementptr i8* %out, i64 1 ; <i8*> [#uses=1]
+ store i8 %86, i8* %87, align 1
+ %88 = getelementptr i8* %out, i64 4 ; <i8*> [#uses=1]
+ %89 = lshr i32 %82, 24 ; <i32> [#uses=1]
+ %90 = trunc i32 %89 to i8 ; <i8> [#uses=1]
+ store i8 %90, i8* %88, align 1
+ %91 = lshr i32 %82, 16 ; <i32> [#uses=1]
+ %92 = trunc i32 %91 to i8 ; <i8> [#uses=1]
+ %93 = getelementptr i8* %out, i64 5 ; <i8*> [#uses=1]
+ store i8 %92, i8* %93, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/lsr-negative-stride.ll b/test/CodeGen/X86/lsr-negative-stride.ll
index 43b507ba52..28d041f060 100644
--- a/test/CodeGen/X86/lsr-negative-stride.ll
+++ b/test/CodeGen/X86/lsr-negative-stride.ll
@@ -16,7 +16,7 @@
;}
-define i32 @t(i32 %a, i32 %b) {
+define i32 @t(i32 %a, i32 %b) nounwind {
entry:
%tmp1434 = icmp eq i32 %a, %b ; <i1> [#uses=1]
br i1 %tmp1434, label %bb17, label %bb.outer
diff --git a/test/CodeGen/X86/remat-mov0.ll b/test/CodeGen/X86/remat-mov-1.ll
index 360628cb6a..98b7bb45e9 100644
--- a/test/CodeGen/X86/remat-mov0.ll
+++ b/test/CodeGen/X86/remat-mov-1.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 | grep xor | count 2
+; RUN: llvm-as < %s | llc -march=x86 | grep 4294967295 | grep mov | count 2
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
%struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* }