summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNadav Rotem <nrotem@apple.com>2013-01-11 07:11:59 +0000
committerNadav Rotem <nrotem@apple.com>2013-01-11 07:11:59 +0000
commita675c74208f77351ea7fa3eed4f542ae781ab566 (patch)
treea2948ca75bec6ed4f9c71bd45b1ae55b36c6cc4f
parent04a0dc772916ddebdeefaf1344247e243ba6636d (diff)
downloadllvm-a675c74208f77351ea7fa3eed4f542ae781ab566.tar.gz
llvm-a675c74208f77351ea7fa3eed4f542ae781ab566.tar.bz2
llvm-a675c74208f77351ea7fa3eed4f542ae781ab566.tar.xz
ARM Cost Model: We need to detect the max bitwidth of types in the loop in order to select the max vectorization factor.
We don't have a detailed analysis of which values are vectorized and which stay scalar in the vectorized loop, so we use another method. We look at reduction variables, loads and stores, which are the only ways to get information in and out of loop iterations. If the data types are extended and truncated, then the cost model will catch the cost of the vector zext/sext/trunc operations.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@172178 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp18
-rw-r--r--test/Transforms/LoopVectorize/ARM/width-detect.ll52
2 files changed, 64 insertions, 6 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4803a9d038..4bb8c43656 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2635,7 +2635,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
assert(MaxVectorSize <= 32 && "Did not expect to pack so many elements"
" into one vector.");
-
+
unsigned VF = MaxVectorSize;
// If we optimize the program for size, avoid creating the tail loop.
@@ -2697,17 +2697,23 @@ unsigned LoopVectorizationCostModel::getWidestType() {
// For each instruction in the loop.
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- if (Legal->isUniformAfterVectorization(it))
+ Type *T = it->getType();
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(it) && !isa<StoreInst>(it) && !isa<PHINode>(it))
continue;
- Type *T = it->getType();
+ // Examine PHI nodes that are reduction variables.
+ if (PHINode *PN = dyn_cast<PHINode>(it))
+ if (!Legal->getReductionVars()->count(PN))
+ continue;
+ // Examine the stored values.
if (StoreInst *ST = dyn_cast<StoreInst>(it))
T = ST->getValueOperand()->getType();
- // PHINodes and pointers are difficult to analyze, but we catch all other
- // uses of the types in other instructions.
- if (isa<PHINode>(it) || T->isPointerTy() || T->isVoidTy())
+ // Ignore stored/loaded pointer types.
+ if (T->isPointerTy())
continue;
MaxWidth = std::max(MaxWidth, T->getScalarSizeInBits());
diff --git a/test/Transforms/LoopVectorize/ARM/width-detect.ll b/test/Transforms/LoopVectorize/ARM/width-detect.ll
new file mode 100644
index 0000000000..c0795b6a79
--- /dev/null
+++ b/test/Transforms/LoopVectorize/ARM/width-detect.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-vectorize -mtriple=thumbv7-apple-ios3.0.0 -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios3.0.0"
+
+;CHECK:foo_F64
+;CHECK: <2 x double>
+;CHECK:ret
+define double @foo_F64(double* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %prod.01 = phi double [ %4, %.lr.ph ], [ 0.000000e+00, %0 ]
+ %2 = getelementptr inbounds double* %A, i64 %indvars.iv
+ %3 = load double* %2, align 8
+ %4 = fmul fast double %prod.01, %3
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %prod.0.lcssa = phi double [ 0.000000e+00, %0 ], [ %4, %.lr.ph ]
+ ret double %prod.0.lcssa
+}
+
+;CHECK:foo_I8
+;CHECK: xor <16 x i8>
+;CHECK:ret
+define signext i8 @foo_I8(i8* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
+ %1 = icmp sgt i32 %n, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0, %.lr.ph
+ %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+ %red.01 = phi i8 [ %4, %.lr.ph ], [ 0, %0 ]
+ %2 = getelementptr inbounds i8* %A, i64 %indvars.iv
+ %3 = load i8* %2, align 1
+ %4 = xor i8 %3, %red.01
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge: ; preds = %.lr.ph, %0
+ %red.0.lcssa = phi i8 [ 0, %0 ], [ %4, %.lr.ph ]
+ ret i8 %red.0.lcssa
+}
+
+