2 files changed, 175 insertions, 31 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0dd6abb1ae..58a1a74655 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -318,6 +318,93 @@ private:
   ValueMap WidenMap;
 };
 
+/// \brief Check if conditionally executed loads are hoistable.
+///
+/// This class has two functions: isHoistableLoad and canHoistAllLoads.
+/// isHoistableLoad should be called on all load instructions that are executed
+/// conditionally. After all conditional loads are processed, the client should
+/// call canHoistAllLoads to determine if all of the conditional executed loads
+/// have an unconditional memory access to the same memory address in the loop.
+class LoadHoisting {
+  typedef SmallPtrSet<Value *, 8> MemorySet;
+
+  Loop *TheLoop;
+  DominatorTree *DT;
+  MemorySet CondLoadAddrSet;
+
+public:
+  LoadHoisting(Loop *L, DominatorTree *D) : TheLoop(L), DT(D) {}
+
+  /// \brief Check if the instruction is a load with a identifiable address.
+  bool isHoistableLoad(Instruction *L);
+
+  /// \brief Check if all of the conditional loads are hoistable because there
+  /// exists an unconditional memory access to the same address in the loop.
+  bool canHoistAllLoads();
+};
+
+bool LoadHoisting::isHoistableLoad(Instruction *L) {
+  LoadInst *LI = dyn_cast<LoadInst>(L);
+  if (!LI)
+    return false;
+
+  CondLoadAddrSet.insert(LI->getPointerOperand());
+  return true;
+}
+
+static void addMemAccesses(BasicBlock *BB, SmallPtrSet<Value *, 8> &Set) {
+  for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) {
+    Instruction *I = &*BI;
+    Value *Addr = 0;
+
+    // Try a load.
+    LoadInst *LI = dyn_cast<LoadInst>(I);
+    if (LI) {
+      Addr = LI->getPointerOperand();
+      Set.insert(Addr);
+      continue;
+    }
+
+    // Try a store.
+    StoreInst *SI = dyn_cast<StoreInst>(I);
+    if (!SI)
+      continue;
+
+    Addr = SI->getPointerOperand();
+    Set.insert(Addr);
+  }
+}
+
+bool LoadHoisting::canHoistAllLoads() {
+  // No conditional loads.
+  if (CondLoadAddrSet.empty())
+    return true;
+
+  MemorySet UncondMemAccesses;
+  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+  BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+
+  // Iterate over the unconditional blocks and collect memory access addresses.
+  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+    BasicBlock *BB = LoopBlocks[i];
+
+    // Ignore conditional blocks.
+    if (BB != LoopLatch && !DT->dominates(BB, LoopLatch))
+      continue;
+
+    addMemAccesses(BB, UncondMemAccesses);
+  }
+
+  // And make sure there is a matching unconditional access for every
+  // conditional load.
+  for (MemorySet::iterator MI = CondLoadAddrSet.begin(),
+       ME = CondLoadAddrSet.end(); MI != ME; ++MI)
+    if (!UncondMemAccesses.count(*MI))
+      return false;
+
+  return true;
+}
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -337,7 +424,8 @@ public:
                             DominatorTree *DT, TargetTransformInfo* TTI,
                             AliasAnalysis *AA, TargetLibraryInfo *TLI)
       : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
-        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false) {}
+        Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+        LoadSpeculation(L, DT) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -598,6 +686,9 @@ private:
   RuntimePointerCheck PtrRtCheck;
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
+
+  /// Utility to determine whether loads can be speculated.
+  LoadHoisting LoadSpeculation;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1389,9 +1480,10 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
     case LoopVectorizationLegality::IK_NoInduction:
       llvm_unreachable("Unknown induction");
     case LoopVectorizationLegality::IK_IntInduction: {
-      // Handle the integer induction counter:
+      // Handle the integer induction counter.
       assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
-      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+
+      // We have the canonical induction variable.
       if (OrigPhi == OldInduction) {
         // Create a truncated version of the resume value for the scalar loop,
         // we might have promoted the type to a larger width.
@@ -1402,11 +1494,20 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
         for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
           TruncResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
         TruncResumeVal->addIncoming(EndValue, VecBody);
+
+        // We know what the end value is.
+        EndValue = IdxEndRoundDown;
+        // We also know which PHI node holds it.
+        ResumeIndex = ResumeVal;
+        break;
       }
-      // We know what the end value is.
-      EndValue = IdxEndRoundDown;
-      // We also know which PHI node holds it.
-      ResumeIndex = ResumeVal;
+
+      // Not the canonical induction variable - add the vector loop count to the
+      // start value.
+      Value *CRD = BypassBuilder.CreateSExtOrTrunc(CountRoundDown,
+                                                   II.StartValue->getType(),
+                                                   "cast.crd");
+      EndValue = BypassBuilder.CreateAdd(CRD, II.StartValue , "ind.end");
       break;
     }
     case LoopVectorizationLegality::IK_ReverseIntInduction: {
@@ -2056,12 +2157,25 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
       case LoopVectorizationLegality::IK_NoInduction:
         llvm_unreachable("Unknown induction");
       case LoopVectorizationLegality::IK_IntInduction: {
-        assert(P == OldInduction && "Unexpected PHI");
-        // We might have had to extend the type.
-        Value *Trunc = Builder.CreateTrunc(Induction, P->getType());
-        Value *Broadcasted = getBroadcastInstrs(Trunc);
-        // After broadcasting the induction variable we need to make the
-        // vector consecutive by adding 0, 1, 2 ...
+        assert(P->getType() == II.StartValue->getType() && "Types must match");
+        Type *PhiTy = P->getType();
+        Value *Broadcasted;
+        if (P == OldInduction) {
+          // Handle the canonical induction variable. We might have had to
+          // extend the type.
+          Broadcasted = Builder.CreateTrunc(Induction, PhiTy);
+        } else {
+          // Handle other induction variables that are now based on the
+          // canonical one.
+          Value *NormalizedIdx = Builder.CreateSub(Induction, ExtendedIdx,
+                                                   "normalized.idx");
+          NormalizedIdx = Builder.CreateSExtOrTrunc(NormalizedIdx, PhiTy);
+          Broadcasted = Builder.CreateAdd(II.StartValue, NormalizedIdx,
+                                          "offset.idx");
+        }
+        Broadcasted = getBroadcastInstrs(Broadcasted);
+        // After broadcasting the induction variable we need to make the vector
+        // consecutive by adding 0, 1, 2, etc.
         for (unsigned part = 0; part < UF; ++part)
           Entry[part] = getConsecutiveVector(Broadcasted, VF * part, false);
         continue;
@@ -2466,11 +2580,11 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
           // Int inductions are special because we only allow one IV.
           if (IK == IK_IntInduction) {
-            if (Induction) {
-              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
-              return false;
-            }
-            Induction = Phi;
+            // Use the phi node with the widest type as induction. Use the last
+            // one if there are multiple (no good reason for doing this other
+            // than it is expedient).
+            if (!Induction || PhiTy == WidestIndTy)
+              Induction = Phi;
           }
 
           DEBUG(dbgs() << "LV: Found an induction variable.\n");
@@ -3236,8 +3350,12 @@ bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
 
 bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    // We don't predicate loads/stores at the moment.
-    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+    // We might be able to hoist the load.
+    if (it->mayReadFromMemory() && !LoadSpeculation.isHoistableLoad(it))
+      return false;
+
+    // We don't predicate stores at the moment.
+    if (it->mayWriteToMemory() || it->mayThrow())
       return false;
 
     // The instructions below can trap.
@@ -3251,6 +3369,10 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
     }
   }
 
+  // Check that we can actually speculate the hoistable loads.
+  if (!LoadSpeculation.canHoistAllLoads())
+    return false;
+
   return true;
 }
 
diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp
index 55adf8a816..50d2af0f65 100644
--- a/lib/Transforms/Vectorize/VecUtils.cpp
+++ b/lib/Transforms/Vectorize/VecUtils.cpp
@@ -282,6 +282,7 @@ int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) {
         DEBUG(dbgs()<<"SLP: Adding to MustExtract "
               "because of a safe out of tree usage.\n");
         MustExtract.insert(*it);
+        continue;
       }
       if (Lane == -1) Lane = LaneMap[*I];
       if (Lane != LaneMap[*I]) {
@@ -610,6 +611,9 @@ Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) {
     GatherInstructions.push_back(Vec);
   }
 
+  for (unsigned i = 0; i < Ty->getNumElements(); ++i)
+    VectorizedValues[VL[i]] = Vec;
+
   return Vec;
 }
 
@@ -617,6 +621,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
   Value *V = vectorizeTree_rec(VL, VF);
 
   Instruction *LastInstr = GetLastInstr(VL, VL.size());
+  int LastInstrIdx = InstrIdx[LastInstr];
   IRBuilder<> Builder(LastInstr);
   for (ValueSet::iterator it = MustExtract.begin(), e = MustExtract.end();
        it != e; ++it) {
@@ -625,7 +630,16 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
     assert(LaneMap.count(I) && "Unable to find the lane for the external use");
     Value *Idx = Builder.getInt32(LaneMap[I]);
     Value *Extract = Builder.CreateExtractElement(Vec, Idx);
-    I->replaceAllUsesWith(Extract);
+    bool Replaced = false;
+    for (Value::use_iterator U = I->use_begin(), UE = U->use_end(); U != UE;
+         ++U) {
+      Instruction *UI = cast<Instruction>(*U);
+      if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx)
+        UI->replaceUsesOfWith(I ,Extract);
+      Replaced = true;
+    }
+    assert(Replaced && "Must replace at least one outside user");
+    (void)Replaced;
   }
 
   // We moved some instructions around. We have to number them again
@@ -633,6 +647,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) {
   numberInstructions();
   MustScalarize.clear();
   MustExtract.clear();
+  VectorizedValues.clear();
   return V;
 }
 
@@ -690,7 +705,10 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
     IRBuilder<> Builder(GetLastInstr(VL, VF));
     CastInst *CI = dyn_cast<CastInst>(VL0);
     Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
-    VectorizedValues[VL0] = V;
+
+    for (int i = 0; i < VF; ++i)
+      VectorizedValues[VL[i]] = V;
+
     return V;
   }
   case Instruction::Add:
@@ -713,16 +731,19 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
   case Instruction::Xor: {
     ValueList LHSVL, RHSVL;
     for (int i = 0; i < VF; ++i) {
-      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+      LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
+      RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
     }
 
-    Value *RHS = vectorizeTree_rec(RHSVL, VF);
     Value *LHS = vectorizeTree_rec(LHSVL, VF);
+    Value *RHS = vectorizeTree_rec(RHSVL, VF);
     IRBuilder<> Builder(GetLastInstr(VL, VF));
     BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
-    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS);
-    VectorizedValues[VL0] = V;
+    Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS);
+
+    for (int i = 0; i < VF; ++i)
+      VectorizedValues[VL[i]] = V;
+
     return V;
   }
   case Instruction::Load: {
@@ -739,7 +760,10 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
                                           VecTy->getPointerTo());
     LI = Builder.CreateLoad(VecPtr);
     LI->setAlignment(Alignment);
-    VectorizedValues[VL0] = LI;
+
+    for (int i = 0; i < VF; ++i)
+      VectorizedValues[VL[i]] = LI;
+
     return LI;
   }
   case Instruction::Store: {
@@ -762,9 +786,7 @@ Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) {
     return 0;
   }
   default:
-    Value *S = Scalarize(VL, VecTy);
-    VectorizedValues[VL0] = S;
-    return S;
+    return Scalarize(VL, VecTy);
   }
 }