-rw-r--r--   lib/Transforms/Vectorize/SLPVectorizer.cpp                 47
-rw-r--r--   test/Transforms/SLPVectorizer/X86/phi.ll                    95
-rw-r--r--   test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll    45
-rw-r--r--   test/Transforms/SLPVectorizer/X86/rgb_phi.ll                14
4 files changed, 177 insertions, 24 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 57cd2a7f82..1f288bcd3f 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1781,28 +1781,53 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
// Check that all of the parts are scalar instructions of the same type.
Instruction *I0 = dyn_cast<Instruction>(VL[0]);
if (!I0)
- return 0;
+ return false;
unsigned Opcode0 = I0->getOpcode();
+
+ Type *Ty0 = I0->getType();
+ unsigned Sz = DL->getTypeSizeInBits(Ty0);
+ unsigned VF = MinVecRegSize / Sz;
for (int i = 0, e = VL.size(); i < e; ++i) {
Type *Ty = VL[i]->getType();
if (Ty->isAggregateType() || Ty->isVectorTy())
- return 0;
+ return false;
Instruction *Inst = dyn_cast<Instruction>(VL[i]);
if (!Inst || Inst->getOpcode() != Opcode0)
- return 0;
+ return false;
}
- R.buildTree(VL);
- int Cost = R.getTreeCost();
-
- if (Cost >= -SLPCostThreshold)
- return false;
+ bool Changed = false;
+
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ unsigned OpsWidth = 0;
+
+ if (i + VF > e)
+ OpsWidth = e - i;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
- DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
- R.vectorizeTree();
- return true;
+ DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n");
+ ArrayRef<Value *> Ops = VL.slice(i, OpsWidth);
+
+ R.buildTree(Ops);
+ int Cost = R.getTreeCost();
+
+ if (Cost < -SLPCostThreshold) {
+ DEBUG(dbgs() << "SLP: Vectorizing pair at cost:" << Cost << ".\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ return Changed;
}
bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
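
The hunk above replaces the single whole-list cost query with a loop that slices the candidate list into register-sized bundles and costs each bundle separately. A minimal standalone sketch of that slicing logic follows (illustrative C++, not LLVM code; the 128-bit MinVecRegSize default, the always-profitable cost stub, and the function name are assumptions):

#include <cstdio>

// Rough model of the new bundling loop in tryToVectorizeList: scalars are
// processed VF at a time, where VF is how many elements of the scalar type
// fit in one vector register. Tree building, cost querying and the actual
// vectorization are stubbed out; only the slicing/advancing logic is shown.
static bool isPowerOf2(unsigned X) { return X && (X & (X - 1)) == 0; }

bool tryToVectorizeListSketch(unsigned NumScalars, unsigned ScalarBits,
                              unsigned MinVecRegSize = 128) {
  unsigned VF = MinVecRegSize / ScalarBits; // lanes per bundle
  bool Changed = false;

  for (unsigned i = 0, e = NumScalars; i < e; ++i) {
    // Width of the bundle starting at i: a full VF, or whatever tail is left.
    unsigned OpsWidth = (i + VF > e) ? (e - i) : VF;

    // Bundles must be power-of-two wide and at least a pair.
    if (!isPowerOf2(OpsWidth) || OpsWidth < 2)
      break;

    std::printf("analyzing bundle [%u, %u)\n", i, i + OpsWidth);

    // Stand-in for buildTree/getTreeCost/vectorizeTree: pretend every bundle
    // is profitable so the cursor advance is visible.
    bool Profitable = true;
    if (Profitable) {
      i += VF - 1; // jump past the bundle that was just vectorized
      Changed = true;
    }
  }
  return Changed;
}

int main() {
  // E.g. 7 floats (32 bits each) with 128-bit registers: one <4 x float>
  // bundle is analyzed, then the 3-wide tail is rejected (not a power of two).
  tryToVectorizeListSketch(7, 32);
}

With an unprofitable bundle the cursor only advances by one element, so the scan retries overlapping bundles shifted by a single position.
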
diff --git a/test/Transforms/SLPVectorizer/X86/phi.ll b/test/Transforms/SLPVectorizer/X86/phi.ll
index f77e945aad..9cc48910d8 100644
--- a/test/Transforms/SLPVectorizer/X86/phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/phi.ll
@@ -95,11 +95,92 @@ for.end: ; preds = %for.body
ret i32 0
}
+; float foo3(float *A) {
+;
+; float R = A[0];
+; float G = A[1];
+; float B = A[2];
+; float Y = A[3];
+; float P = A[4];
+; for (int i=0; i < 121; i+=3) {
+; R+=A[i+0]*7;
+; G+=A[i+1]*8;
+; B+=A[i+2]*9;
+; Y+=A[i+3]*10;
+; P+=A[i+4]*11;
+; }
+;
+; return R+G+B+Y+P;
+; }
+
+;CHECK: foo3
+;CHECK: phi <4 x float>
+;CHECK: fmul <4 x float>
+;CHECK: fadd <4 x float>
+;CHECK-NOT: phi <5 x float>
+;CHECK-NOT: fmul <5 x float>
+;CHECK-NOT: fadd <5 x float>
+
+define float @foo3(float* nocapture readonly %A) #0 {
+entry:
+ %0 = load float* %A, align 4
+ %arrayidx1 = getelementptr inbounds float* %A, i64 1
+ %1 = load float* %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds float* %A, i64 2
+ %2 = load float* %arrayidx2, align 4
+ %arrayidx3 = getelementptr inbounds float* %A, i64 3
+ %3 = load float* %arrayidx3, align 4
+ %arrayidx4 = getelementptr inbounds float* %A, i64 4
+ %4 = load float* %arrayidx4, align 4
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %5 = phi float [ %1, %entry ], [ %11, %for.body ]
+ %6 = phi float [ %0, %entry ], [ %9, %for.body ]
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %P.056 = phi float [ %4, %entry ], [ %add26, %for.body ]
+ %Y.055 = phi float [ %3, %entry ], [ %add21, %for.body ]
+ %B.054 = phi float [ %2, %entry ], [ %add16, %for.body ]
+ %G.053 = phi float [ %1, %entry ], [ %add11, %for.body ]
+ %R.052 = phi float [ %0, %entry ], [ %add6, %for.body ]
+ %mul = fmul float %6, 7.000000e+00
+ %add6 = fadd float %R.052, %mul
+ %mul10 = fmul float %5, 8.000000e+00
+ %add11 = fadd float %G.053, %mul10
+ %7 = add nsw i64 %indvars.iv, 2
+ %arrayidx14 = getelementptr inbounds float* %A, i64 %7
+ %8 = load float* %arrayidx14, align 4
+ %mul15 = fmul float %8, 9.000000e+00
+ %add16 = fadd float %B.054, %mul15
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
+ %arrayidx19 = getelementptr inbounds float* %A, i64 %indvars.iv.next
+ %9 = load float* %arrayidx19, align 4
+ %mul20 = fmul float %9, 1.000000e+01
+ %add21 = fadd float %Y.055, %mul20
+ %10 = add nsw i64 %indvars.iv, 4
+ %arrayidx24 = getelementptr inbounds float* %A, i64 %10
+ %11 = load float* %arrayidx24, align 4
+ %mul25 = fmul float %11, 1.100000e+01
+ %add26 = fadd float %P.056, %mul25
+ %12 = trunc i64 %indvars.iv.next to i32
+ %cmp = icmp slt i32 %12, 121
+ br i1 %cmp, label %for.body, label %for.end
+
+for.end: ; preds = %for.body
+ %add28 = fadd float %add6, %add11
+ %add29 = fadd float %add28, %add16
+ %add30 = fadd float %add29, %add21
+ %add31 = fadd float %add30, %add26
+ ret float %add31
+}
+
define void @test(x86_fp80* %i1, x86_fp80* %i2, x86_fp80* %o) {
; CHECK-LABEL: @test(
;
; Test that we correctly recognize the discontiguous memory in arrays where the
; size is less than the alignment, and through various different GEP formations.
+;
+; We disable the vectorization of x86_fp80 for now.
entry:
%i1.0 = load x86_fp80* %i1, align 16
@@ -107,8 +188,8 @@ entry:
%i1.1 = load x86_fp80* %i1.gep1, align 16
; CHECK: load x86_fp80*
; CHECK: load x86_fp80*
-; CHECK: insertelement <2 x x86_fp80>
-; CHECK: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
br i1 undef, label %then, label %end
then:
@@ -118,16 +199,16 @@ then:
%i2.1 = load x86_fp80* %i2.gep1, align 16
; CHECK: load x86_fp80*
; CHECK: load x86_fp80*
-; CHECK: insertelement <2 x x86_fp80>
-; CHECK: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
+; CHECK-NOT: insertelement <2 x x86_fp80>
br label %end
end:
%phi0 = phi x86_fp80 [ %i1.0, %entry ], [ %i2.0, %then ]
%phi1 = phi x86_fp80 [ %i1.1, %entry ], [ %i2.1, %then ]
-; CHECK: phi <2 x x86_fp80>
-; CHECK: extractelement <2 x x86_fp80>
-; CHECK: extractelement <2 x x86_fp80>
+; CHECK-NOT: phi <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
+; CHECK-NOT: extractelement <2 x x86_fp80>
store x86_fp80 %phi0, x86_fp80* %o, align 16
%o.gep1 = getelementptr inbounds x86_fp80* %o, i64 1
store x86_fp80 %phi1, x86_fp80* %o.gep1, align 16
diff --git a/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
new file mode 100644
index 0000000000..520e6729de
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/phi_overalignedtype.ll
@@ -0,0 +1,45 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-100 -dce -S -mtriple=i386-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+; We purposely over-align f64 to 128bit here.
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:128:128-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9.0"
+
+
+define void @test(double* %i1, double* %i2, double* %o) {
+; CHECK-LABEL: @test(
+;
+; Test that we correctly recognize the discontiguous memory in arrays where the
+; size is less than the alignment, and through various different GEP formations.
+
+entry:
+ %i1.0 = load double* %i1, align 16
+ %i1.gep1 = getelementptr double* %i1, i64 1
+ %i1.1 = load double* %i1.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+ br i1 undef, label %then, label %end
+
+then:
+ %i2.gep0 = getelementptr inbounds double* %i2, i64 0
+ %i2.0 = load double* %i2.gep0, align 16
+ %i2.gep1 = getelementptr inbounds double* %i2, i64 1
+ %i2.1 = load double* %i2.gep1, align 16
+; CHECK: load double*
+; CHECK: load double*
+; CHECK: insertelement <2 x double>
+; CHECK: insertelement <2 x double>
+ br label %end
+
+end:
+ %phi0 = phi double [ %i1.0, %entry ], [ %i2.0, %then ]
+ %phi1 = phi double [ %i1.1, %entry ], [ %i2.1, %then ]
+; CHECK: phi <2 x double>
+; CHECK: extractelement <2 x double>
+; CHECK: extractelement <2 x double>
+ store double %phi0, double* %o, align 16
+ %o.gep1 = getelementptr inbounds double* %o, i64 1
+ store double %phi1, double* %o.gep1, align 16
+ ret void
+}
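
The over-aligned f64 in the datalayout string above is what produces the "discontiguous memory" this test describes: a double still stores 8 bytes, but its ABI alignment, and hence the array stride a GEP uses, is rounded up to 16 bytes, so A[0] and A[1] can no longer be folded into one <2 x double> load and are instead gathered with insertelement. A small hedged sketch of that stride computation (the 8/10/16-byte figures are read off the f64:128:128 and f80:128:128 entries; the rounding rule mirrors how DataLayout derives the alloc size):

#include <cstdio>

// The distance between successive array elements (LLVM's "alloc size") is the
// store size rounded up to the ABI alignment of the element type.
unsigned allocSize(unsigned StoreBytes, unsigned ABIAlignBytes) {
  return (StoreBytes + ABIAlignBytes - 1) / ABIAlignBytes * ABIAlignBytes;
}

int main() {
  // f64:128:128 -> a double stores 8 bytes but is strided to 16 in arrays.
  unsigned F64Stride = allocSize(/*StoreBytes=*/8, /*ABIAlignBytes=*/16);
  std::printf("f64 stride = %u bytes (%s)\n", F64Stride,
              F64Stride == 8 ? "contiguous" : "discontiguous");
  // f80:128:128 -> the x86_fp80 case in phi.ll: 10-byte store, 16-byte stride.
  std::printf("x86_fp80 stride = %u bytes\n", allocSize(10, 16));
}
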
diff --git a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
index 3235fd9a30..6aea5d3c6f 100644
--- a/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
+++ b/test/Transforms/SLPVectorizer/X86/rgb_phi.ll
@@ -3,6 +3,8 @@
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
target triple = "i386-apple-macosx10.9.0"
+; We disable the vectorization of <3 x float> for now
+
; float foo(float *A) {
;
; float R = A[0];
@@ -19,14 +21,14 @@ target triple = "i386-apple-macosx10.9.0"
;CHECK-LABEL: @foo(
;CHECK: br
-;CHECK: phi <3 x float>
-;CHECK: fmul <3 x float>
-;CHECK: fadd <3 x float>
+;CHECK-NOT: phi <3 x float>
+;CHECK-NOT: fmul <3 x float>
+;CHECK-NOT: fadd <3 x float>
; At the moment we don't sink extractelements.
;CHECK: br
-;CHECK: extractelement
-;CHECK: extractelement
-;CHECK: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
+;CHECK-NOT: extractelement
;CHECK: ret
define float @foo(float* nocapture readonly %A) {