LoopIdiom: Replace custom dependence analysis with LoopDependenceAnalysis.

Requires a lot less code and complexity on loop-idiom's side and the more precise analysis can catch more cases, like the one I included as a test case. This also fixes the edge-case miscompilation from PR9481. I'm not entirely sure that all cases are handled that the old checks handled but LDA will certainly become smarter in the future. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166390 91177308-0d34-0410-b5e6-96231b3b80d8
author: Benjamin Kramer <benny.kra@googlemail.com> 2012-10-21 15:03:07 +0000
committer: Benjamin Kramer <benny.kra@googlemail.com> 2012-10-21 15:03:07 +0000
commit: 5c6e9ae14e012bfe1fdf0e19957922d3c4d85670 (patch)
tree: e8a0f8e874ee1e924d23af4e0601267968409aec
parent: bb950854acbb5966875763eaae7ab58e48e4f5a9 (diff)
download: llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.gz
llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.bz2
llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.xz
3 files changed, 128 insertions, 74 deletions
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a44e798f12..5c398a8b69 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Module.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopDependenceAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
@@ -106,6 +107,8 @@ namespace {
       AU.addPreserved<AliasAnalysis>();
       AU.addRequired<ScalarEvolution>();
       AU.addPreserved<ScalarEvolution>();
+      AU.addRequired<LoopDependenceAnalysis>();
+      AU.addPreserved<LoopDependenceAnalysis>();
       AU.addPreserved<DominatorTree>();
       AU.addRequired<DominatorTree>();
       AU.addRequired<TargetLibraryInfo>();
@@ -122,6 +125,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(LCSSA)
 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
+INITIALIZE_PASS_DEPENDENCY(LoopDependenceAnalysis)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms",
                     false, false)
@@ -163,15 +167,6 @@ static void deleteDeadInstruction(Instruction *I, ScalarEvolution &SE,
   } while (!NowDeadInsts.empty());
 }
 
-/// deleteIfDeadInstruction - If the specified value is a dead instruction,
-/// delete it and any recursively used instructions.
-static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
-                                    const TargetLibraryInfo *TLI) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    if (isInstructionTriviallyDead(I, TLI))
-      deleteDeadInstruction(I, SE, TLI);
-}
-
 bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
   CurLoop = L;
 
@@ -368,35 +363,16 @@ processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) {
                                  MSI, Ev, BECount);
 }
 
-
-/// mayLoopAccessLocation - Return true if the specified loop might access the
-/// specified pointer location, which is a loop-strided access.  The 'Access'
-/// argument specifies what the verboten forms of access are (read or write).
-static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access,
-                                  Loop *L, const SCEV *BECount,
-                                  unsigned StoreSize, AliasAnalysis &AA,
-                                  Instruction *IgnoredStore) {
-  // Get the location that may be stored across the loop.  Since the access is
-  // strided positively through memory, we say that the modified location starts
-  // at the pointer and has infinite size.
-  uint64_t AccessSize = AliasAnalysis::UnknownSize;
-
-  // If the loop iterates a fixed number of times, we can refine the access size
-  // to be exactly the size of the memset, which is (BECount+1)*StoreSize
-  if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
-    AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize;
-
-  // TODO: For this to be really effective, we have to dive into the pointer
-  // operand in the store.  Store to &A[i] of 100 will always return may alias
-  // with store of &A[100], we need to StoreLoc to be "A" with size of 100,
-  // which will then no-alias a store to &A[100].
-  AliasAnalysis::Location StoreLoc(Ptr, AccessSize);
-
+/// hasDependence - Uses the LoopDependenceAnalysis to determine whether 'Inst'
+/// depends on any other value in the Loop 'L'.
+static bool hasDependence(Instruction *Inst, Loop *L,
+                          LoopDependenceAnalysis &LDA) {
   for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
        ++BI)
     for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I)
-      if (&*I != IgnoredStore &&
-          (AA.getModRefInfo(I, StoreLoc) & Access))
+      if (&*I != Inst && I->mayReadOrWriteMemory() &&
+          (I->mayWriteToMemory() || Inst->mayWriteToMemory()) &&
+          LDA.depends(Inst, I))
         return true;
 
   return false;
@@ -474,6 +450,11 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
     return false;
   }
 
+  // Make sure the store has no dependencies (i.e. other loads and stores) in
+  // the loop.
+  if (hasDependence(TheStore, CurLoop, getAnalysis<LoopDependenceAnalysis>()))
+    return false;
+
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
@@ -482,25 +463,13 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
   SCEVExpander Expander(*SE, "loop-idiom");
 
   // Okay, we have a strided store "p[i]" of a splattable value.  We can turn
-  // this into a memset in the loop preheader now if we want.  However, this
-  // would be unsafe to do if there is anything else in the loop that may read
-  // or write to the aliased location.  Check for any overlap by generating the
-  // base pointer and checking the region.
+  // this into a memset in the loop preheader now if we want.
   unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace();
   Value *BasePtr =
     Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace),
                            Preheader->getTerminator());
 
 
-  if (mayLoopAccessLocation(BasePtr, AliasAnalysis::ModRef,
-                            CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), TheStore)){
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(BasePtr, *SE, TLI);
-    return false;
-  }
-
   // Okay, everything looks good, insert the memset.
 
   // The # stored bytes is (BECount+1)*Size.  Expand the trip count out to
@@ -563,6 +532,14 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
 
   LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
 
+  // Make sure the load and the store have no dependencies (i.e. other loads and
+  // stores) in the loop.
+  // FIXME: If we want to form a memmove SI and LI can be dependent but the
+  // distance must be positive. LDA doesn't provide that info currently.
+  LoopDependenceAnalysis &LDA = getAnalysis<LoopDependenceAnalysis>();
+  if (hasDependence(SI, CurLoop, LDA) || hasDependence(LI, CurLoop, LDA))
+    return false;
+
   // The trip count of the loop and the base pointer of the addrec SCEV is
   // guaranteed to be loop invariant, which means that it should dominate the
   // header.  This allows us to insert code for it in the preheader.
@@ -571,41 +548,16 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize,
   SCEVExpander Expander(*SE, "loop-idiom");
 
   // Okay, we have a strided store "p[i]" of a loaded value.  We can turn
-  // this into a memcpy in the loop preheader now if we want.  However, this
-  // would be unsafe to do if there is anything else in the loop that may read
-  // or write the memory region we're storing to.  This includes the load that
-  // feeds the stores.  Check for an alias by generating the base address and
-  // checking everything.
+  // this into a memcpy in the loop preheader now if we want.
   Value *StoreBasePtr =
     Expander.expandCodeFor(StoreEv->getStart(),
                            Builder.getInt8PtrTy(SI->getPointerAddressSpace()),
                            Preheader->getTerminator());
-
-  if (mayLoopAccessLocation(StoreBasePtr, AliasAnalysis::ModRef,
-                            CurLoop, BECount, StoreSize,
-                            getAnalysis<AliasAnalysis>(), SI)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
-    return false;
-  }
-
-  // For a memcpy, we have to make sure that the input array is not being
-  // mutated by the loop.
   Value *LoadBasePtr =
     Expander.expandCodeFor(LoadEv->getStart(),
                            Builder.getInt8PtrTy(LI->getPointerAddressSpace()),
                            Preheader->getTerminator());
 
-  if (mayLoopAccessLocation(LoadBasePtr, AliasAnalysis::Mod, CurLoop, BECount,
-                            StoreSize, getAnalysis<AliasAnalysis>(), SI)) {
-    Expander.clear();
-    // If we generated new code for the base pointer, clean up.
-    deleteIfDeadInstruction(LoadBasePtr, *SE, TLI);
-    deleteIfDeadInstruction(StoreBasePtr, *SE, TLI);
-    return false;
-  }
-
   // Okay, everything is safe, we can transform this!
 
 
diff --git a/test/Transforms/LoopIdiom/multi-dimensional.ll b/test/Transforms/LoopIdiom/multi-dimensional.ll
new file mode 100644
index 0000000000..991f2688d7
--- /dev/null
+++ b/test/Transforms/LoopIdiom/multi-dimensional.ll
@@ -0,0 +1,49 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%struct.ham = type { [2 x [2 x [2 x [16 x [8 x i32]]]]], i32, %struct.zot }
+%struct.zot = type { i32, i16, i16, [2 x [1152 x i32]] }
+
+define void @test1(%struct.ham* nocapture %arg) nounwind {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb11, %bb
+  %tmp = phi i64 [ 0, %bb ], [ %tmp12, %bb11 ]
+  br label %bb2
+
+bb2:                                              ; preds = %bb2, %bb1
+  %tmp3 = phi i64 [ 0, %bb1 ], [ %tmp8, %bb2 ]
+  %tmp4 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 1, i64 1, i64 %tmp, i64 %tmp3
+  store i32 0, i32* %tmp4, align 4
+  %tmp5 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 1, i64 0, i64 %tmp, i64 %tmp3
+  store i32 0, i32* %tmp5, align 4
+  %tmp6 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 0, i64 1, i64 %tmp, i64 %tmp3
+  store i32 0, i32* %tmp6, align 4
+  %tmp7 = getelementptr inbounds %struct.ham* %arg, i64 0, i32 0, i64 0, i64 0, i64 0, i64 %tmp, i64 %tmp3
+  store i32 0, i32* %tmp7, align 4
+  %tmp8 = add i64 %tmp3, 1
+  %tmp9 = trunc i64 %tmp8 to i32
+  %tmp10 = icmp eq i32 %tmp9, 8
+  br i1 %tmp10, label %bb11, label %bb2
+
+bb11:                                             ; preds = %bb2
+  %tmp12 = add i64 %tmp, 1
+  %tmp13 = trunc i64 %tmp12 to i32
+  %tmp14 = icmp eq i32 %tmp13, 16
+  br i1 %tmp14, label %bb15, label %bb1
+
+bb15:                                             ; preds = %bb11
+  ret void
+
+; CHECK: @test1
+; CHECK: bb1:
+; CHECK-NOT: store
+; CHECK: call void @llvm.memset.p0i8.i64
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+; CHECK-NEXT: call void @llvm.memset.p0i8.i64
+; CHECK-NOT: store
+; CHECK: br
+}
diff --git a/test/Transforms/LoopIdiom/sideeffect.ll b/test/Transforms/LoopIdiom/sideeffect.ll
new file mode 100644
index 0000000000..460e5233f9
--- /dev/null
+++ b/test/Transforms/LoopIdiom/sideeffect.ll
@@ -0,0 +1,53 @@
+; RUN: opt -basicaa -loop-idiom < %s -S | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+; PR9481
+define i32 @test1() nounwind uwtable ssp {
+entry:
+  %a = alloca [10 x i8], align 1
+  br label %for.body
+
+for.cond1.preheader:                              ; preds = %for.body
+  %arrayidx5.phi.trans.insert = getelementptr inbounds [10 x i8]* %a, i64 0, i64 0
+  %.pre = load i8* %arrayidx5.phi.trans.insert, align 1
+  br label %for.body3
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv29 = phi i64 [ 0, %entry ], [ %indvars.iv.next30, %for.body ]
+  call void (...)* @bar() nounwind
+  %arrayidx = getelementptr inbounds [10 x i8]* %a, i64 0, i64 %indvars.iv29
+  store i8 23, i8* %arrayidx, align 1
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, 1000000
+  br i1 %exitcond32, label %for.cond1.preheader, label %for.body
+
+for.body3:                                        ; preds = %for.body3, %for.cond1.preheader
+  %0 = phi i8 [ %.pre, %for.cond1.preheader ], [ %add, %for.body3 ]
+  %indvars.iv = phi i64 [ 1, %for.cond1.preheader ], [ %indvars.iv.next, %for.body3 ]
+  call void (...)* @bar() nounwind
+  %arrayidx7 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 %indvars.iv
+  %1 = load i8* %arrayidx7, align 1
+  %add = add i8 %1, %0
+  store i8 %add, i8* %arrayidx7, align 1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1000000
+  br i1 %exitcond, label %for.end12, label %for.body3
+
+for.end12:                                        ; preds = %for.body3
+  %arrayidx13 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 2
+  %2 = load i8* %arrayidx13, align 1
+  %conv14 = sext i8 %2 to i32
+  %arrayidx15 = getelementptr inbounds [10 x i8]* %a, i64 0, i64 6
+  %3 = load i8* %arrayidx15, align 1
+  %conv16 = sext i8 %3 to i32
+  %add17 = add nsw i32 %conv16, %conv14
+  ret i32 %add17
+
+; CHECK: @test1
+; CHECK-NOT: @llvm.memset
+}
+
+declare void @bar(...)
author	Benjamin Kramer <benny.kra@googlemail.com>	2012-10-21 15:03:07 +0000
committer	Benjamin Kramer <benny.kra@googlemail.com>	2012-10-21 15:03:07 +0000
commit	5c6e9ae14e012bfe1fdf0e19957922d3c4d85670 (patch)
tree	e8a0f8e874ee1e924d23af4e0601267968409aec
parent	bb950854acbb5966875763eaae7ab58e48e4f5a9 (diff)
download	llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.gz llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.bz2 llvm-5c6e9ae14e012bfe1fdf0e19957922d3c4d85670.tar.xz