diff options
-rw-r--r-- | lib/Transforms/Vectorize/SLPVectorizer.cpp | 101 | ||||
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll | 60 | ||||
-rw-r--r-- | test/Transforms/SLPVectorizer/X86/value-bug.ll | 80 |
3 files changed, 211 insertions, 30 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index d7ccb796d3..895826f1d8 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" +#include "llvm/IR/NoFolder.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" @@ -357,13 +358,13 @@ public: /// A negative number means that this is profitable. int getTreeCost(); - /// Construct a vectorizable tree that starts at \p Roots and is possibly - /// used by a reduction of \p RdxOps. - void buildTree(ArrayRef<Value *> Roots, ValueSet *RdxOps = 0); + /// Construct a vectorizable tree that starts at \p Roots, ignoring users for + /// the purpose of scheduling and extraction in the \p UserIgnoreLst. + void buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst = None); /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { - RdxOps = 0; VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -526,8 +527,8 @@ private: return I.first->second; } - /// Reduction operators. - ValueSet *RdxOps; + /// List of users to ignore during scheduling and that don't need extracting. + ArrayRef<Value *> UserIgnoreList; // Analysis and block reference. Function *F; @@ -542,9 +543,10 @@ private: IRBuilder<> Builder; }; -void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { +void BoUpSLP::buildTree(ArrayRef<Value *> Roots, + ArrayRef<Value *> UserIgnoreLst) { deleteTree(); - RdxOps = Rdx; + UserIgnoreList = UserIgnoreLst; if (!getSameType(Roots)) return; buildTree_rec(Roots, 0); @@ -576,8 +578,9 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots, ValueSet *Rdx) { if (!UserInst) continue; - // Ignore uses that are part of the reduction. - if (Rdx && std::find(Rdx->begin(), Rdx->end(), UserInst) != Rdx->end()) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UserInst) != + UserIgnoreList.end()) continue; DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " << @@ -708,8 +711,9 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) { continue; } - // This user is part of the reduction. - if (RdxOps && RdxOps->count(UI)) + // Ignore users in the user ignore list. + if (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), UI) != + UserIgnoreList.end()) continue; // Make sure that we can schedule this unknown user. @@ -1747,8 +1751,9 @@ Value *BoUpSLP::vectorizeTree() { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); assert((ScalarToTreeEntry.count(U) || - // It is legal to replace the reduction users by undef. - (RdxOps && RdxOps->count(U))) && + // It is legal to replace users in the ignorelist by undef. + (std::find(UserIgnoreList.begin(), UserIgnoreList.end(), U) != + UserIgnoreList.end())) && "Replacing out-of-tree value with undef"); } #endif @@ -1954,8 +1959,11 @@ private: bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); /// \brief Try to vectorize a list of operands. + /// \@param BuildVector A list of users to ignore for the purpose of + /// scheduling and that don't need extracting. /// \returns true if a value was vectorized. - bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector = None); /// \brief Try to vectorize a chain that may start at the operands of \V; bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); @@ -2128,7 +2136,8 @@ bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { return tryToVectorizeList(VL, R); } -bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, + ArrayRef<Value *> BuildVector) { if (VL.size() < 2) return false; @@ -2178,13 +2187,38 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { << "\n"); ArrayRef<Value *> Ops = VL.slice(i, OpsWidth); - R.buildTree(Ops); + ArrayRef<Value *> BuildVectorSlice; + if (!BuildVector.empty()) + BuildVectorSlice = BuildVector.slice(i, OpsWidth); + + R.buildTree(Ops, BuildVectorSlice); int Cost = R.getTreeCost(); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); - R.vectorizeTree(); - + Value *VectorizedRoot = R.vectorizeTree(); + + // Reconstruct the build vector by extracting the vectorized root. This + // way we handle the case where some elements of the vector are undefined. + // (return (inserelt <4 xi32> (insertelt undef (opd0) 0) (opd1) 2)) + if (!BuildVectorSlice.empty()) { + // The insert point is the last build vector instruction. The vectorized + // root will precede it. This guarantees that we get an instruction. The + // vectorized tree could have been constant folded. + Instruction *InsertAfter = cast<Instruction>(BuildVectorSlice.back()); + unsigned VecIdx = 0; + for (auto &V : BuildVectorSlice) { + IRBuilder<true, NoFolder> Builder( + ++BasicBlock::iterator(InsertAfter)); + InsertElementInst *IE = cast<InsertElementInst>(V); + Instruction *Extract = cast<Instruction>(Builder.CreateExtractElement( + VectorizedRoot, Builder.getInt32(VecIdx++))); + IE->setOperand(1, Extract); + IE->removeFromParent(); + IE->insertAfter(Extract); + InsertAfter = IE; + } + } // Move to the next bundle. i += VF - 1; Changed = true; @@ -2293,7 +2327,7 @@ static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx, /// *p = /// class HorizontalReduction { - SmallPtrSet<Value *, 16> ReductionOps; + SmallVector<Value *, 16> ReductionOps; SmallVector<Value *, 32> ReducedVals; BinaryOperator *ReductionRoot; @@ -2387,7 +2421,7 @@ public: // We need to be able to reassociate the adds. if (!TreeN->isAssociative()) return false; - ReductionOps.insert(TreeN); + ReductionOps.push_back(TreeN); } // Retract. Stack.pop_back(); @@ -2424,7 +2458,7 @@ public: for (; i < NumReducedVals - ReduxWidth + 1; i += ReduxWidth) { ArrayRef<Value *> ValsToReduce(&ReducedVals[i], ReduxWidth); - V.buildTree(ValsToReduce, &ReductionOps); + V.buildTree(ValsToReduce, ReductionOps); // Estimate cost. int Cost = V.getTreeCost() + getReductionCost(TTI, ReducedVals[i]); @@ -2543,13 +2577,16 @@ private: /// /// Returns true if it matches /// -static bool findBuildVector(InsertElementInst *IE, - SmallVectorImpl<Value *> &Ops) { - if (!isa<UndefValue>(IE->getOperand(0))) +static bool findBuildVector(InsertElementInst *FirstInsertElem, + SmallVectorImpl<Value *> &BuildVector, + SmallVectorImpl<Value *> &BuildVectorOpds) { + if (!isa<UndefValue>(FirstInsertElem->getOperand(0))) return false; + InsertElementInst *IE = FirstInsertElem; while (true) { - Ops.push_back(IE->getOperand(1)); + BuildVector.push_back(IE); + BuildVectorOpds.push_back(IE->getOperand(1)); if (IE->use_empty()) return false; @@ -2720,12 +2757,16 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } // Try to vectorize trees that start at insertelement instructions. - if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) { - SmallVector<Value *, 8> Ops; - if (!findBuildVector(IE, Ops)) + if (InsertElementInst *FirstInsertElem = dyn_cast<InsertElementInst>(it)) { + SmallVector<Value *, 16> BuildVector; + SmallVector<Value *, 16> BuildVectorOpds; + if (!findBuildVector(FirstInsertElem, BuildVector, BuildVectorOpds)) continue; - if (tryToVectorizeList(Ops, R)) { + // Vectorize starting with the build vector operands ignoring the + // BuildVector instructions for the purpose of scheduling and user + // extraction. + if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) { Changed = true; it = BB->begin(); e = BB->end(); diff --git a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll index 7537ea3b05..db33927cb7 100644 --- a/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -195,6 +195,30 @@ define <4 x float> @simple_select_partial_vector(<4 x float> %a, <4 x float> %b, ret <4 x float> %rb } +; Make sure that vectorization happens even if insertelements operations +; must be rescheduled. The case here is from compiling Julia. +define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @reschedule_extract( +; CHECK: %1 = fadd <4 x float> %a, %b + %a0 = extractelement <4 x float> %a, i32 0 + %b0 = extractelement <4 x float> %b, i32 0 + %c0 = fadd float %a0, %b0 + %v0 = insertelement <4 x float> undef, float %c0, i32 0 + %a1 = extractelement <4 x float> %a, i32 1 + %b1 = extractelement <4 x float> %b, i32 1 + %c1 = fadd float %a1, %b1 + %v1 = insertelement <4 x float> %v0, float %c1, i32 1 + %a2 = extractelement <4 x float> %a, i32 2 + %b2 = extractelement <4 x float> %b, i32 2 + %c2 = fadd float %a2, %b2 + %v2 = insertelement <4 x float> %v1, float %c2, i32 2 + %a3 = extractelement <4 x float> %a, i32 3 + %b3 = extractelement <4 x float> %b, i32 3 + %c3 = fadd float %a3, %b3 + %v3 = insertelement <4 x float> %v2, float %c3, i32 3 + ret <4 x float> %v3 +} + ; Check that cost model for vectorization takes credit for ; instructions that are erased. define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { @@ -219,4 +243,40 @@ define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ret <4 x float> %v3 } +; Make sure we handle multiple trees that feed one build vector correctly. +define <4 x double> @multi_tree(double %w, double %x, double %y, double %z) { +entry: + %t0 = fadd double %w , 0.000000e+00 + %t1 = fadd double %x , 1.000000e+00 + %t2 = fadd double %y , 2.000000e+00 + %t3 = fadd double %z , 3.000000e+00 + %t4 = fmul double %t0, 1.000000e+00 + %i1 = insertelement <4 x double> undef, double %t4, i32 3 + %t5 = fmul double %t1, 1.000000e+00 + %i2 = insertelement <4 x double> %i1, double %t5, i32 2 + %t6 = fmul double %t2, 1.000000e+00 + %i3 = insertelement <4 x double> %i2, double %t6, i32 1 + %t7 = fmul double %t3, 1.000000e+00 + %i4 = insertelement <4 x double> %i3, double %t7, i32 0 + ret <4 x double> %i4 +} +; CHECK-LABEL: @multi_tree +; CHECK-DAG: %[[V0:.+]] = insertelement <2 x double> undef, double %w, i32 0 +; CHECK-DAG: %[[V1:.+]] = insertelement <2 x double> %[[V0]], double %x, i32 1 +; CHECK-DAG: %[[V2:.+]] = fadd <2 x double> %[[V1]], <double 0.000000e+00, double 1.000000e+00> +; CHECK-DAG: %[[V3:.+]] = insertelement <2 x double> undef, double %y, i32 0 +; CHECK-DAG: %[[V4:.+]] = insertelement <2 x double> %[[V3]], double %z, i32 1 +; CHECK-DAG: %[[V5:.+]] = fadd <2 x double> %[[V4]], <double 2.000000e+00, double 3.000000e+00> +; CHECK-DAG: %[[V6:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V2]] +; CHECK-DAG: %[[V7:.+]] = extractelement <2 x double> %[[V6]], i32 0 +; CHECK-DAG: %[[I1:.+]] = insertelement <4 x double> undef, double %[[V7]], i32 3 +; CHECK-DAG: %[[V8:.+]] = extractelement <2 x double> %[[V6]], i32 1 +; CHECK-DAG: %[[I2:.+]] = insertelement <4 x double> %[[I1]], double %[[V8]], i32 2 +; CHECK-DAG: %[[V9:.+]] = fmul <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[V5]] +; CHECK-DAG: %[[V10:.+]] = extractelement <2 x double> %[[V9]], i32 0 +; CHECK-DAG: %[[I3:.+]] = insertelement <4 x double> %i2, double %[[V10]], i32 1 +; CHECK-DAG: %[[V11:.+]] = extractelement <2 x double> %[[V9]], i32 1 +; CHECK-DAG: %[[I4:.+]] = insertelement <4 x double> %i3, double %[[V11]], i32 0 +; CHECK: ret <4 x double> %[[I4]] + attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/test/Transforms/SLPVectorizer/X86/value-bug.ll b/test/Transforms/SLPVectorizer/X86/value-bug.ll new file mode 100644 index 0000000000..64d2ae1c7d --- /dev/null +++ b/test/Transforms/SLPVectorizer/X86/value-bug.ll @@ -0,0 +1,80 @@ +; RUN: opt -slp-vectorizer < %s -S -mtriple="x86_64-grtev3-linux-gnu" -mcpu=corei7-avx | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-grtev3-linux-gnu" + +; We used to crash on this example because we were building a constant +; expression during vectorization and the vectorizer expects instructions +; as elements of the vectorized tree. +; CHECK-LABEL: @test +; PR19621 + +define void @test() { +bb279: + br label %bb283 + +bb283: + %Av.sroa.8.0 = phi float [ undef, %bb279 ], [ %tmp315, %exit ] + %Av.sroa.5.0 = phi float [ undef, %bb279 ], [ %tmp319, %exit ] + %Av.sroa.3.0 = phi float [ undef, %bb279 ], [ %tmp307, %exit ] + %Av.sroa.0.0 = phi float [ undef, %bb279 ], [ %tmp317, %exit ] + br label %bb284 + +bb284: + %tmp7.i = fpext float %Av.sroa.3.0 to double + %tmp8.i = fsub double %tmp7.i, undef + %tmp9.i = fsub double %tmp8.i, undef + %tmp17.i = fpext float %Av.sroa.8.0 to double + %tmp19.i = fsub double %tmp17.i, undef + %tmp20.i = fsub double %tmp19.i, undef + br label %bb21.i + +bb21.i: + br i1 undef, label %bb22.i, label %exit + +bb22.i: + %tmp24.i = fadd double undef, %tmp9.i + %tmp26.i = fadd double undef, %tmp20.i + br label %bb32.i + +bb32.i: + %xs.0.i = phi double [ %tmp24.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] + %ys.0.i = phi double [ %tmp26.i, %bb22.i ], [ 0.000000e+00, %bb32.i ] + br i1 undef, label %bb32.i, label %bb21.i + +exit: + %tmp303 = fpext float %Av.sroa.0.0 to double + %tmp304 = fmul double %tmp303, undef + %tmp305 = fadd double undef, %tmp304 + %tmp306 = fadd double %tmp305, undef + %tmp307 = fptrunc double %tmp306 to float + %tmp311 = fpext float %Av.sroa.5.0 to double + %tmp312 = fmul double %tmp311, 0.000000e+00 + %tmp313 = fadd double undef, %tmp312 + %tmp314 = fadd double %tmp313, undef + %tmp315 = fptrunc double %tmp314 to float + %tmp317 = fptrunc double undef to float + %tmp319 = fptrunc double undef to float + br label %bb283 +} + +; Make sure that we probably handle constant folded vectorized trees. The +; vectorizer starts at the type (%t2, %t3) and wil constant fold the tree. +; The code that handles insertelement instructions must handle this. +define <4 x double> @constant_folding() { +entry: + %t0 = fadd double 1.000000e+00 , 0.000000e+00 + %t1 = fadd double 1.000000e+00 , 1.000000e+00 + %t2 = fmul double %t0, 1.000000e+00 + %i1 = insertelement <4 x double> undef, double %t2, i32 1 + %t3 = fmul double %t1, 1.000000e+00 + %i2 = insertelement <4 x double> %i1, double %t3, i32 0 + ret <4 x double> %i2 +} + +; CHECK-LABEL: @constant_folding +; CHECK: %[[V0:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 0 +; CHECK: %[[V1:.+]] = insertelement <4 x double> undef, double %[[V0]], i32 1 +; CHECK: %[[V2:.+]] = extractelement <2 x double> <double 1.000000e+00, double 2.000000e+00>, i32 1 +; CHECK: %[[V3:.+]] = insertelement <4 x double> %[[V1]], double %[[V2]], i32 0 +; CHECK: ret <4 x double> %[[V3]] |