2 files changed, 181 insertions, 46 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 349d04524f..7867d9fad3 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -634,6 +634,19 @@ static Type *getAccessType(const Instruction *Inst) {
   return AccessTy;
 }
 
+/// isExistingPhi - Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+  for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
+       PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+    if (SE.isSCEVable(PN->getType()) &&
+        (SE.getEffectiveSCEVType(PN->getType()) ==
+         SE.getEffectiveSCEVType(AR->getType())) &&
+        SE.getSCEV(PN) == AR)
+      return true;
+  }
+  return false;
+}
+
 /// DeleteTriviallyDeadInstructions - If any of the instructions is the
 /// specified set are trivially dead, delete them and see if this makes any of
 /// their operands subsequently dead.
@@ -703,7 +716,8 @@ public:
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
                    const SmallVectorImpl<int64_t> &Offsets,
-                   ScalarEvolution &SE, DominatorTree &DT);
+                   ScalarEvolution &SE, DominatorTree &DT,
+                   SmallPtrSet<const SCEV *, 16> *LoserRegs = 0);
 
   void print(raw_ostream &OS) const;
   void dump() const;
@@ -716,7 +730,8 @@ private:
   void RatePrimaryRegister(const SCEV *Reg,
                            SmallPtrSet<const SCEV *, 16> &Regs,
                            const Loop *L,
-                           ScalarEvolution &SE, DominatorTree &DT);
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSet<const SCEV *, 16> *LoserRegs);
 };
 
 }
@@ -736,18 +751,13 @@ void Cost::RateRegister(const SCEV *Reg,
     // on other loops, and cannot be expected to change sibling loops. If the
     // AddRec exists, consider it's register free and leave it alone. Otherwise,
     // do not consider this formula at all.
-    // FIXME: why do we need to generate such fomulae?
     else if (!EnableNested || L->contains(AR->getLoop()) ||
              (!AR->getLoop()->contains(L) &&
               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
-      for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
-           PHINode *PN = dyn_cast<PHINode>(I); ++I) {
-        if (SE.isSCEVable(PN->getType()) &&
-            (SE.getEffectiveSCEVType(PN->getType()) ==
-             SE.getEffectiveSCEVType(AR->getType())) &&
-            SE.getSCEV(PN) == AR)
-          return;
-      }
+      if (isExistingPhi(AR, SE))
+        return;
+
+      // For !EnableNested, never rewrite IVs in other loops.
       if (!EnableNested) {
         Loose();
         return;
@@ -789,13 +799,22 @@ void Cost::RateRegister(const SCEV *Reg,
 }
 
 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
-/// before, rate it.
+/// before, rate it. Optional LoserRegs provides a way to declare any formula
+/// that refers to one of those regs an instant loser.
 void Cost::RatePrimaryRegister(const SCEV *Reg,
                                SmallPtrSet<const SCEV *, 16> &Regs,
                                const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT) {
-  if (Regs.insert(Reg))
+                               ScalarEvolution &SE, DominatorTree &DT,
+                               SmallPtrSet<const SCEV *, 16> *LoserRegs) {
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Loose();
+    return;
+  }
+  if (Regs.insert(Reg)) {
     RateRegister(Reg, Regs, L, SE, DT);
+    if (isLoser())
+      LoserRegs->insert(Reg);
+  }
 }
 
 void Cost::RateFormula(const Formula &F,
@@ -803,14 +822,15 @@ void Cost::RateFormula(const Formula &F,
                        const DenseSet<const SCEV *> &VisitedRegs,
                        const Loop *L,
                        const SmallVectorImpl<int64_t> &Offsets,
-                       ScalarEvolution &SE, DominatorTree &DT) {
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       SmallPtrSet<const SCEV *, 16> *LoserRegs) {
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Loose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT);
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
     if (isLoser())
       return;
   }
@@ -821,7 +841,7 @@ void Cost::RateFormula(const Formula &F,
       Loose();
       return;
     }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
     if (isLoser())
       return;
   }
@@ -1103,7 +1123,6 @@ bool LSRUse::InsertFormula(const Formula &F) {
   Formulae.push_back(F);
 
   // Record registers now being used by this use.
-  if (F.ScaledReg) Regs.insert(F.ScaledReg);
   Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
 
   return true;
@@ -1114,7 +1133,6 @@ void LSRUse::DeleteFormula(Formula &F) {
   if (&F != &Formulae.back())
     std::swap(F, Formulae.back());
   Formulae.pop_back();
-  assert(!Formulae.empty() && "LSRUse has no formulae left!");
 }
 
 /// RecomputeRegs - Recompute the Regs field, and update RegUses.
@@ -2912,6 +2930,7 @@ LSRInstance::GenerateAllReuseFormulae() {
 void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
   DenseSet<const SCEV *> VisitedRegs;
   SmallPtrSet<const SCEV *, 16> Regs;
+  SmallPtrSet<const SCEV *, 16> LoserRegs;
 #ifndef NDEBUG
   bool ChangedFormulae = false;
 #endif
@@ -2931,46 +2950,66 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
          FIdx != NumForms; ++FIdx) {
       Formula &F = LU.Formulae[FIdx];
 
-      SmallVector<const SCEV *, 2> Key;
-      for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
-           JE = F.BaseRegs.end(); J != JE; ++J) {
-        const SCEV *Reg = *J;
-        if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
-          Key.push_back(Reg);
+      // Some formulas are instant losers. For example, they may depend on
+      // nonexistent AddRecs from other loops. These need to be filtered
+      // immediately, otherwise heuristics could choose them over others leading
+      // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+      // avoids the need to recompute this information across formulae using the
+      // same bad AddRec. Passing LoserRegs is also essential unless we remove
+      // the corresponding bad register from the Regs set.
+      Cost CostF;
+      Regs.clear();
+      CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT,
+                        &LoserRegs);
+      if (CostF.isLoser()) {
+        // During initial formula generation, undesirable formulae are generated
+        // by uses within other loops that have some non-trivial address mode or
+        // use the postinc form of the IV. LSR needs to provide these formulae
+        // as the basis of rediscovering the desired formula that uses an AddRec
+        // corresponding to the existing phi. Once all formulae have been
+        // generated, these initial losers may be pruned.
+        DEBUG(dbgs() << "  Filtering loser "; F.print(dbgs());
+              dbgs() << "\n");
       }
-      if (F.ScaledReg &&
-          RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
-        Key.push_back(F.ScaledReg);
-      // Unstable sort by host order ok, because this is only used for
-      // uniquifying.
-      std::sort(Key.begin(), Key.end());
-
-      std::pair<BestFormulaeTy::const_iterator, bool> P =
-        BestFormulae.insert(std::make_pair(Key, FIdx));
-      if (!P.second) {
+      else {
+        SmallVector<const SCEV *, 2> Key;
+        for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
+               JE = F.BaseRegs.end(); J != JE; ++J) {
+          const SCEV *Reg = *J;
+          if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+            Key.push_back(Reg);
+        }
+        if (F.ScaledReg &&
+            RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+          Key.push_back(F.ScaledReg);
+        // Unstable sort by host order ok, because this is only used for
+        // uniquifying.
+        std::sort(Key.begin(), Key.end());
+
+        std::pair<BestFormulaeTy::const_iterator, bool> P =
+          BestFormulae.insert(std::make_pair(Key, FIdx));
+        if (P.second)
+          continue;
+
         Formula &Best = LU.Formulae[P.first->second];
 
-        Cost CostF;
-        CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
-        Regs.clear();
         Cost CostBest;
-        CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
         Regs.clear();
+        CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
               dbgs() << "\n"
                         "    in favor of formula "; Best.print(dbgs());
               dbgs() << '\n');
+      }
 #ifndef NDEBUG
-        ChangedFormulae = true;
+      ChangedFormulae = true;
 #endif
-        LU.DeleteFormula(F);
-        --FIdx;
-        --NumForms;
-        Any = true;
-        continue;
-      }
+      LU.DeleteFormula(F);
+      --FIdx;
+      --NumForms;
+      Any = true;
     }
 
     // Now that we've filtered out some formulae, recompute the Regs set.
diff --git a/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll b/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll
new file mode 100644
index 0000000000..5108650962
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/2011-12-04-loserreg.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s | FileCheck %s
+;
+; Test LSR's ability to prune formulae that refer to nonexistant
+; AddRecs in other loops.
+;
+; Unable to reduce this case further because it requires LSR to exceed
+; ComplexityLimit.
+;
+; We really just want to ensure that LSR can process this loop without
+; finding an unsatisfactory solution and bailing out. I've added
+; dummyout, an obvious candidate for postinc replacement so we can
+; verify that LSR removes it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin"
+
+; CHECK: @test
+; CHECK: # %for.body{{$}}
+; dummyiv copy should be removed
+; CHECK-NOT: movq
+; CHECK: # %for.cond19.preheader
+; dummycnt should be removed
+; CHECK-NOT: incq
+; CHECK: # %for.body23{{$}}
+define i64 @test(i64 %count, float* nocapture %srcrow, i32* nocapture %destrow) nounwind uwtable ssp {
+entry:
+  %cmp34 = icmp eq i64 %count, 0
+  br i1 %cmp34, label %for.end29, label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %dummyiv = phi i64 [ %dummycnt, %for.body ], [ 0, %entry ]
+  %indvars.iv39 = phi i64 [ %indvars.iv.next40, %for.body ], [ 0, %entry ]
+  %dp.036 = phi i32* [ %add.ptr, %for.body ], [ %destrow, %entry ]
+  %p.035 = phi float* [ %incdec.ptr4, %for.body ], [ %srcrow, %entry ]
+  %incdec.ptr = getelementptr inbounds float* %p.035, i64 1
+  %0 = load float* %incdec.ptr, align 4
+  %incdec.ptr2 = getelementptr inbounds float* %p.035, i64 2
+  %1 = load float* %incdec.ptr2, align 4
+  %incdec.ptr3 = getelementptr inbounds float* %p.035, i64 3
+  %2 = load float* %incdec.ptr3, align 4
+  %incdec.ptr4 = getelementptr inbounds float* %p.035, i64 4
+  %3 = load float* %incdec.ptr4, align 4
+  %4 = load i32* %dp.036, align 4
+  %conv5 = fptoui float %0 to i32
+  %or = or i32 %4, %conv5
+  %arrayidx6 = getelementptr inbounds i32* %dp.036, i64 1
+  %5 = load i32* %arrayidx6, align 4
+  %conv7 = fptoui float %1 to i32
+  %or8 = or i32 %5, %conv7
+  %arrayidx9 = getelementptr inbounds i32* %dp.036, i64 2
+  %6 = load i32* %arrayidx9, align 4
+  %conv10 = fptoui float %2 to i32
+  %or11 = or i32 %6, %conv10
+  %arrayidx12 = getelementptr inbounds i32* %dp.036, i64 3
+  %7 = load i32* %arrayidx12, align 4
+  %conv13 = fptoui float %3 to i32
+  %or14 = or i32 %7, %conv13
+  store i32 %or, i32* %dp.036, align 4
+  store i32 %or8, i32* %arrayidx6, align 4
+  store i32 %or11, i32* %arrayidx9, align 4
+  store i32 %or14, i32* %arrayidx12, align 4
+  %add.ptr = getelementptr inbounds i32* %dp.036, i64 4
+  %indvars.iv.next40 = add i64 %indvars.iv39, 4
+  %dummycnt = add i64 %dummyiv, 1
+  %cmp = icmp ult i64 %indvars.iv.next40, %count
+  br i1 %cmp, label %for.body, label %for.cond19.preheader
+
+for.cond19.preheader:                             ; preds = %for.body
+  %dummyout = add i64 %dummyiv, 1
+  %rem = and i64 %count, 3
+  %cmp2130 = icmp eq i64 %rem, 0
+  br i1 %cmp2130, label %for.end29, label %for.body23.lr.ph
+
+for.body23.lr.ph:                                 ; preds = %for.cond19.preheader
+  %8 = and i64 %count, 3
+  br label %for.body23
+
+for.body23:                                       ; preds = %for.body23, %for.body23.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body23.lr.ph ], [ %indvars.iv.next, %for.body23 ]
+  %dp.132 = phi i32* [ %add.ptr, %for.body23.lr.ph ], [ %incdec.ptr28, %for.body23 ]
+  %p.131 = phi float* [ %incdec.ptr4, %for.body23.lr.ph ], [ %incdec.ptr24, %for.body23 ]
+  %incdec.ptr24 = getelementptr inbounds float* %p.131, i64 1
+  %9 = load float* %incdec.ptr24, align 4
+  %10 = load i32* %dp.132, align 4
+  %conv25 = fptoui float %9 to i32
+  %or26 = or i32 %10, %conv25
+  store i32 %or26, i32* %dp.132, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %incdec.ptr28 = getelementptr inbounds i32* %dp.132, i64 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %8
+  br i1 %exitcond, label %for.end29, label %for.body23
+
+for.end29:                                        ; preds = %entry, %for.body23, %for.cond19.preheader
+  %result = phi i64 [ 0, %entry ], [ %dummyout, %for.body23 ], [ %dummyout, %for.cond19.preheader ]
+  ret i64 %result
+}