summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorNadav Rotem <nrotem@apple.com>2013-07-07 06:57:07 +0000
committerNadav Rotem <nrotem@apple.com>2013-07-07 06:57:07 +0000
commit369cc938d261de3295eb70d0738f54ef1a82806c (patch)
tree10497d00e5f4f7a2cd0a9d4b741da0d448787f10 /lib
parent95a1b3484d7daf7830161f1613fc812303641abe (diff)
downloadllvm-369cc938d261de3295eb70d0738f54ef1a82806c.tar.gz
llvm-369cc938d261de3295eb70d0738f54ef1a82806c.tar.bz2
llvm-369cc938d261de3295eb70d0738f54ef1a82806c.tar.xz
SLPVectorizer: Implement DCE as part of vectorization.
This is a complete re-write if the bottom-up vectorization class. Before this commit we scanned the instruction tree 3 times. First in search of merge points for the trees. Second, for estimating the cost. And finally for vectorization. There was a lot of code duplication and adding the DCE exposed bugs. The new design is simpler and DCE was a part of the design. In this implementation we build the tree once. After that we estimate the cost by scanning the different entries in the constructed tree (in any order). The vectorization phase also works on the built tree. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185774 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Transforms/Vectorize/SLPVectorizer.cpp2052
1 files changed, 1041 insertions, 1011 deletions
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 85c01bdeff..d2e7450d24 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -59,7 +59,7 @@ static const unsigned RecursionMaxDepth = 12;
class BuilderLocGuard {
public:
BuilderLocGuard(IRBuilder<> &B) : Builder(B), Loc(B.GetInsertPoint()) {}
- ~BuilderLocGuard() { Builder.SetInsertPoint(Loc); }
+ ~BuilderLocGuard() { if (Loc) Builder.SetInsertPoint(Loc); }
private:
// Prevent copying.
@@ -91,6 +91,7 @@ struct BlockNumbering {
}
int getIndex(Instruction *I) {
+ assert(I->getParent() == BB && "Invalid instruction");
if (!Valid)
numberInstructions();
assert(InstrIdx.count(I) && "Unknown instruction");
@@ -117,72 +118,169 @@ private:
std::vector<Instruction *> InstrVec;
};
-class FuncSLP {
+/// \returns the parent basic block if all of the instructions in \p VL
+/// are in the same block or null otherwise.
+static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return 0;
+ BasicBlock *BB = I0->getParent();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I)
+ return 0;
+
+ if (BB != I->getParent())
+ return 0;
+ }
+ return BB;
+}
+
+/// \returns True if all of the values in \p VL are constants.
+static bool allConstant(ArrayRef<Value *> VL) {
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ if (!isa<Constant>(VL[i]))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
+/// \returns The opcode if all of the Instructions in \p VL have the same
+/// opcode, or zero.
+static unsigned getSameOpcode(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return 0;
+ unsigned Opcode = I0->getOpcode();
+ for (int i = 1, e = VL.size(); i < e; i++) {
+ Instruction *I = dyn_cast<Instruction>(VL[i]);
+ if (!I || Opcode != I->getOpcode())
+ return 0;
+ }
+ return Opcode;
+}
+
+/// \returns The type that all of the values in \p VL have or null if there
+/// are different types.
+static Type* getSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[0]->getType() != Ty)
+ return 0;
+
+ return Ty;
+}
+
+/// \returns True if the ExtractElement instructions in VL can be vectorized
+/// to use the original vector.
+static bool CanReuseExtract(ArrayRef<Value *> VL) {
+ assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *VL0 = VL[0];
+ ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
+ Value *Vec = E0->getOperand(0);
+
+ // We have to extract from the same vector type.
+ unsigned NElts = Vec->getType()->getVectorNumElements();
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
+ if (!CI || CI->getZExtValue())
+ return false;
+
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
+ ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+
+ if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
+ return false;
+ }
+
+ return true;
+}
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+public:
typedef SmallVector<Value *, 8> ValueList;
typedef SmallVector<Instruction *, 16> InstrList;
typedef SmallPtrSet<Value *, 16> ValueSet;
typedef SmallVector<StoreInst *, 8> StoreList;
-public:
- static const int MAX_COST = INT_MIN;
-
- FuncSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
- TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
+ BoUpSLP(Function *Func, ScalarEvolution *Se, DataLayout *Dl,
+ TargetTransformInfo *Tti, AliasAnalysis *Aa, LoopInfo *Li,
DominatorTree *Dt) :
F(Func), SE(Se), DL(Dl), TTI(Tti), AA(Aa), LI(Li), DT(Dt),
Builder(Se->getContext()) {
- for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
- BasicBlock *BB = it;
- BlocksNumbers[BB] = BlockNumbering(BB);
+ // Setup the block numbering utility for all of the blocks in the
+ // function.
+ for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+ BasicBlock *BB = it;
+ BlocksNumbers[BB] = BlockNumbering(BB);
+ }
}
- }
-
- /// \brief Take the pointer operand from the Load/Store instruction.
- /// \returns NULL if this is not a valid Load/Store instruction.
- static Value *getPointerOperand(Value *I);
-
- /// \brief Take the address space operand from the Load/Store instruction.
- /// \returns -1 if this is not a valid Load/Store instruction.
- static unsigned getAddressSpaceOperand(Value *I);
-
- /// \returns true if the memory operations A and B are consecutive.
- bool isConsecutiveAccess(Value *A, Value *B);
/// \brief Vectorize the tree that starts with the elements in \p VL.
- /// \returns the vectorized value.
- Value *vectorizeTree(ArrayRef<Value *> VL);
+ void vectorizeTree();
/// \returns the vectorization cost of the subtree that starts at \p VL.
/// A negative number means that this is profitable.
- int getTreeCost(ArrayRef<Value *> VL);
+ int getTreeCost();
+
+ /// Construct a vectorizable tree that starts at \p Roots.
+ void buildTree(ArrayRef<Value *> Roots);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ MemBarrierIgnoreList.clear();
+ }
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
/// roots. This method calculates the cost of extracting the values.
int getGatherCost(ArrayRef<Value *> VL);
- /// \brief Attempts to order and vectorize a sequence of stores. This
- /// function does a quadratic scan of the given stores.
- /// \returns true if the basic block was modified.
- bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+ /// \returns true if the memory operations A and B are consecutive.
+ bool isConsecutiveAccess(Value *A, Value *B);
- /// \brief Vectorize a group of scalars into a vector tree.
- /// \returns the vectorized value.
- Value *vectorizeArith(ArrayRef<Value *> Operands);
+ /// \brief Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+private:
+ struct TreeEntry;
- /// \brief This method contains the recursive part of getTreeCost.
- int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+ /// \returns the cost of the vectorizable entry.
+ int getEntryCost(TreeEntry *E);
- /// \brief This recursive method looks for vectorization hazards such as
- /// values that are used by multiple users and checks that values are used
- /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
- void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
- /// \brief This method contains the recursive part of vectorizeTree.
- Value *vectorizeTree_rec(ArrayRef<Value *> VL);
+ /// Vectorizer a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
- /// \brief Vectorize a sorted sequence of stores.
- bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+ /// Vectorizer a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \brief Take the pointer operand from the Load/Store instruction.
+ /// \returns NULL if this is not a valid Load/Store instruction.
+ static Value *getPointerOperand(Value *I);
+
+ /// \brief Take the address space operand from the Load/Store instruction.
+ /// \returns -1 if this is not a valid Load/Store instruction.
+ static unsigned getAddressSpaceOperand(Value *I);
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
@@ -211,57 +309,65 @@ public:
/// \returns a vector from a collection of scalars in \p VL.
Value *Gather(ArrayRef<Value *> VL, VectorType *Ty);
- /// \brief Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence();
+ struct TreeEntry {
+ TreeEntry() : Scalars(), VectorizedValue(0), LastScalarIndex(0),
+ NeedToGather(0) {}
- bool needToGatherAny(ArrayRef<Value *> VL) {
- for (int i = 0, e = VL.size(); i < e; ++i)
- if (MustGather.count(VL[i]))
- return true;
- return false;
- }
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) {
+ assert(VL.size() == Scalars.size() && "Invalid size");
+ for (int i = 0, e = VL.size(); i != e; ++i)
+ if (VL[i] != Scalars[i])
+ return false;
+ return true;
+ }
+
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue;
+
+ /// The index in the basic block of the last scalar.
+ int LastScalarIndex;
- void forgetNumbering() {
- for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it)
- BlocksNumbers[it].forget();
+ /// Do we need to gather this sequence ?
+ bool NeedToGather;
+ };
+
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized) {
+ VectorizableTree.push_back(TreeEntry());
+ int idx = VectorizableTree.size() - 1;
+ TreeEntry *Last = &VectorizableTree[idx];
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ Last->NeedToGather = !Vectorized;
+ if (Vectorized) {
+ Last->LastScalarIndex = getLastIndex(VL);
+ for (int i = 0, e = VL.size(); i != e; ++i) {
+ assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
+ ScalarToTreeEntry[VL[i]] = idx;
+ }
+ } else {
+ Last->LastScalarIndex = 0;
+ MustGather.insert(VL.begin(), VL.end());
+ }
+ return Last;
}
/// -- Vectorization State --
+ /// Holds all of the tree entries.
+ std::vector<TreeEntry> VectorizableTree;
- /// Maps values in the tree to the vector lanes that uses them. This map must
- /// be reset between runs of getCost.
- std::map<Value *, int> LaneMap;
- /// A list of instructions to ignore while sinking
- /// memory instructions. This map must be reset between runs of getCost.
- ValueSet MemBarrierIgnoreList;
-
- /// Maps between the first scalar to the vector. This map must be reset
- /// between runs.
- DenseMap<Value *, Value *> VectorizedValues;
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, int> ScalarToTreeEntry;
- /// Contains values that must be gathered because they are used
- /// by multiple lanes, or by users outside the tree.
- /// NOTICE: The vectorization methods also use this set.
+ /// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
- /// Contains PHINodes that are being processed. We use this data structure
- /// to stop cycles in the graph.
- ValueSet VisitedPHIs;
-
- /// Contains a list of values that are used outside the current tree, the
- /// first element in the bundle and the insertion point for extracts. This
- /// set must be reset between runs.
- struct UseInfo{
- UseInfo(Instruction *VL0, int I) :
- Leader(VL0), LastIndex(I) {}
- UseInfo() : Leader(0), LastIndex(0) {}
- /// The first element in the bundle.
- Instruction *Leader;
- /// The insertion index.
- int LastIndex;
- };
- MapVector<Instruction*, UseInfo> MultiUserVals;
- SetVector<Instruction*> ExtractedLane;
+ /// A list of instructions to ignore while sinking
+ /// memory instructions. This map must be reset between runs of getCost.
+ ValueSet MemBarrierIgnoreList;
/// Holds all of the instructions that we gathered.
SetVector<Instruction *> GatherSeq;
@@ -281,14 +387,478 @@ public:
IRBuilder<> Builder;
};
-int FuncSLP::getGatherCost(Type *Ty) {
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
+ deleteTree();
+ buildTree_rec(Roots, 0);
+}
+
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+ bool SameTy = getSameType(VL); (void)SameTy;
+ assert(SameTy && "Invalid types!");
+
+ if (Depth == RecursionMaxDepth) {
+ DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (VL[0]->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+ !getSameOpcode(VL)) {
+ DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Check if this is a duplicate of another entry.
+ if (ScalarToTreeEntry.count(VL[0])) {
+ int Idx = ScalarToTreeEntry[VL[0]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
+ if (E->Scalars[i] != VL[i]) {
+ DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
+ ") is already in tree.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // If any of the scalars appears in the table OR it is marked as a value that
+ // needs to stat scalar then we need to gather the scalars.
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ if (ScalarToTreeEntry.count(VL[i]) || MustGather.count(VL[i])) {
+ DEBUG(dbgs() << "SLP: Gathering due to gathered scalar. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ int MyLastIndex = getLastIndex(VL);
+ BasicBlock *BB = cast<Instruction>(VL0)->getParent();
+
+ for (unsigned i = 0, e = VL.size(); i != e; ++i) {
+ Instruction *Scalar = cast<Instruction>(VL[i]);
+ DEBUG(dbgs() << "SLP: Checking users of " << *Scalar << ". \n");
+ for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
+ U != UE; ++U) {
+ DEBUG(dbgs() << "SLP: \tUser " << **U << ". \n");
+ Instruction *User = dyn_cast<Instruction>(*U);
+ if (!User) {
+ DEBUG(dbgs() << "SLP: Gathering due unknown user. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // We don't care if the user is in a different basic block.
+ BasicBlock *UserBlock = User->getParent();
+ if (UserBlock != BB) {
+ DEBUG(dbgs() << "SLP: User from a different basic block "
+ << *User << ". \n");
+ continue;
+ }
+
+ // If this is a PHINode within this basic block then we can place the
+ // extract wherever we want.
+ if (isa<PHINode>(*User)) {
+ DEBUG(dbgs() << "SLP: \tWe can schedule PHIs:" << *User << ". \n");
+ continue;
+ }
+
+ // Check if this is a safe in-tree user.
+ if (ScalarToTreeEntry.count(User)) {
+ int Idx = ScalarToTreeEntry[User];
+ int VecLocation = VectorizableTree[Idx].LastScalarIndex;
+ if (VecLocation <= MyLastIndex) {
+ DEBUG(dbgs() << "SLP: Gathering due to unschedulable vector. \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ DEBUG(dbgs() << "SLP: In-tree user (" << *User << ") at #" <<
+ VecLocation << " vector value (" << *Scalar << ") at #"
+ << MyLastIndex << ".\n");
+ continue;
+ }
+
+ // Make sure that we can schedule this unknown user.
+ BlockNumbering &BN = BlocksNumbers[BB];
+ int UserIndex = BN.getIndex(User);
+ if (UserIndex < MyLastIndex) {
+
+ DEBUG(dbgs() << "SLP: Can't schedule extractelement for "
+ << *User << ". \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ }
+
+ // Check that every instructions appears once in this bundle.
+ for (unsigned i = 0, e = VL.size(); i < e; ++i)
+ for (unsigned j = i+1; j < e; ++j)
+ if (VL[i] == VL[j]) {
+ DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+
+ // Check that instructions in this bundle don't reference other instructions.
+ // The runtime of this check is O(N * N-1 * uses(N)) and a typical N is 4.
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
+ U != UE; ++U) {
+ for (unsigned j = 0; j < e; ++j) {
+ if (i != j && *U == VL[j]) {
+ DEBUG(dbgs() << "SLP: Intra-bundle dependencies!" << **U << ". \n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ }
+ }
+
+ DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ unsigned Opcode = getSameOpcode(VL);
+
+ // Check if it is safe to sink the loads or the stores.
+ if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
+ Instruction *Last = getLastInstruction(VL);
+
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ if (VL[i] == Last)
+ continue;
+ Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
+ if (Barrier) {
+ DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
+ << "\n because of " << *Barrier << ". Gathering.\n");
+ newTreeEntry(VL, false);
+ return;
+ }
+ }
+ }
+
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
+
+ buildTree_rec(Operands, Depth + 1);
+ }
+ return;
+ }
+ case Instruction::ExtractElement: {
+ bool Reuse = CanReuseExtract(VL);
+ if (Reuse) {
+ DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+ }
+ newTreeEntry(VL, Reuse);
+ return;
+ }
+ case Instruction::Load: {
+ // Check if the loads are consecutive or of we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Need to swizzle loads.\n");
+ return;
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ return;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (unsigned i = 0; i < VL.size(); ++i) {
+ Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+ if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+ CmpInst *Cmp = cast<CmpInst>(VL[i]);
+ if (Cmp->getPredicate() != P0) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::Select:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
+
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+ buildTree_rec(Operands, Depth+1);
+ }
+ return;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or of we need to swizzle them.
+ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+ if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Non consecutive store.\n");
+ return;
+ }
+
+ newTreeEntry(VL, true);
+ DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+
+ ValueList Operands;
+ for (unsigned j = 0; j < VL.size(); ++j)
+ Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+
+ // We can ignore these values because we are sinking them down.
+ MemBarrierIgnoreList.insert(VL.begin(), VL.end());
+ buildTree_rec(Operands, Depth + 1);
+ return;
+ }
+ default:
+ newTreeEntry(VL, false);
+ DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+int BoUpSLP::getEntryCost(TreeEntry *E) {
+ ArrayRef<Value*> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+
+ if (E->NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ return getGatherCost(E->Scalars);
+ }
+
+ assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) &&
+ "Invalid VL");
+ Instruction *VL0 = cast<Instruction>(VL[0]);
+ unsigned Opcode = VL0->getOpcode();
+ switch (Opcode) {
+ case Instruction::PHI: {
+ return 0;
+ }
+ case Instruction::ExtractElement: {
+ if (CanReuseExtract(VL))
+ return 0;
+ return getGatherCost(VecTy);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+
+ // Calculate the cost of this instruction.
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
+ VL0->getType(), SrcTy);
+
+ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Calculate the cost of this instruction.
+ int ScalarCost = 0;
+ int VecCost = 0;
+ if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
+ Opcode == Instruction::Select) {
+ VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ ScalarCost = VecTy->getNumElements() *
+ TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
+ VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+ } else {
+ ScalarCost = VecTy->getNumElements() *
+ TTI->getArithmeticInstrCost(Opcode, ScalarTy);
+ VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+ }
+ return VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+ int ScalarLdCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+ return VecLdCost - ScalarLdCost;
+ }
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ int ScalarStCost = VecTy->getNumElements() *
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
+ return VecStCost - ScalarStCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
+int BoUpSLP::getTreeCost() {
+ int Cost = 0;
+ DEBUG(dbgs() << "SLP: Calculating cost for tree of size " <<
+ VectorizableTree.size() << ".\n");
+
+ for (unsigned i = 0, e = VectorizableTree.size(); i != e; ++i) {
+ int C = getEntryCost(&VectorizableTree[i]);
+ DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with "
+ << *VectorizableTree[i].Scalars[0] << " .\n");
+ Cost += C;
+ }
+ DEBUG(dbgs() << "SLP: Total Cost " << Cost << ".\n");
+ return Cost;
+}
+
+int BoUpSLP::getGatherCost(Type *Ty) {
int Cost = 0;
for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
return Cost;
}
-int FuncSLP::getGatherCost(ArrayRef<Value *> VL) {
+int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) {
// Find the type of the operands in VL.
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
@@ -298,7 +868,7 @@ int FuncSLP::getGatherCost(ArrayRef<Value *> VL) {
return getGatherCost(VecTy);
}
-AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) {
+AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) {
if (StoreInst *SI = dyn_cast<StoreInst>(I))
return AA->getLocation(SI);
if (LoadInst *LI = dyn_cast<LoadInst>(I))
@@ -306,7 +876,7 @@ AliasAnalysis::Location FuncSLP::getLocation(Instruction *I) {
return AliasAnalysis::Location();
}
-Value *FuncSLP::getPointerOperand(Value *I) {
+Value *BoUpSLP::getPointerOperand(Value *I) {
if (LoadInst *LI = dyn_cast<LoadInst>(I))
return LI->getPointerOperand();
if (StoreInst *SI = dyn_cast<StoreInst>(I))
@@ -314,7 +884,7 @@ Value *FuncSLP::getPointerOperand(Value *I) {
return 0;
}
-unsigned FuncSLP::getAddressSpaceOperand(Value *I) {
+unsigned BoUpSLP::getAddressSpaceOperand(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
if (StoreInst *S = dyn_cast<StoreInst>(I))
@@ -322,7 +892,7 @@ unsigned FuncSLP::getAddressSpaceOperand(Value *I) {
return -1;
}
-bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) {
+bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) {
Value *PtrA = getPointerOperand(A);
Value *PtrB = getPointerOperand(B);
unsigned ASA = getAddressSpaceOperand(A);
@@ -354,7 +924,7 @@ bool FuncSLP::isConsecutiveAccess(Value *A, Value *B) {
return ((-Offset) == Sz);
}
-Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
+Value *BoUpSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
assert(Src->getParent() == Dst->getParent() && "Not the same BB");
BasicBlock::iterator I = Src, E = Dst;
/// Scan all of the instruction from SRC to DST and check if
@@ -379,234 +949,7 @@ Value *FuncSLP::getSinkBarrier(Instruction *Src, Instruction *Dst) {
return 0;
}
-static BasicBlock *getSameBlock(ArrayRef<Value *> VL) {
- BasicBlock *BB = 0;
- for (int i = 0, e = VL.size(); i < e; i++) {
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- if (!I)
- return 0;
-
- if (!BB) {
- BB = I->getParent();
- continue;
- }
-
- if (BB != I->getParent())
- return 0;
- }
- return BB;
-}
-
-static bool allConstant(ArrayRef<Value *> VL) {
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- if (!isa<Constant>(VL[i]))
- return false;
- return true;
-}
-
-static bool isSplat(ArrayRef<Value *> VL) {
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (VL[i] != VL[0])
- return false;
- return true;
-}
-
-static unsigned getSameOpcode(ArrayRef<Value *> VL) {
- unsigned Opcode = 0;
- for (int i = 0, e = VL.size(); i < e; i++) {
- if (Instruction *I = dyn_cast<Instruction>(VL[i])) {
- if (!Opcode) {
- Opcode = I->getOpcode();
- continue;
- }
- if (Opcode != I->getOpcode())
- return 0;
- }
- }
- return Opcode;
-}
-
-static bool CanReuseExtract(ArrayRef<Value *> VL, unsigned VF,
- VectorType *VecTy) {
- assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *VL0 = VL[0];
- ExtractElementInst *E0 = cast<ExtractElementInst>(VL0);
- Value *Vec = E0->getOperand(0);
-
- // We have to extract from the same vector type.
- if (Vec->getType() != VecTy)
- return false;
-
- // Check that all of the indices extract from the correct offset.
- ConstantInt *CI = dyn_cast<ConstantInt>(E0->getOperand(1));
- if (!CI || CI->getZExtValue())
- return false;
-
- for (unsigned i = 1, e = VF; i < e; ++i) {
- ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
- ConstantInt *CI = dyn_cast<ConstantInt>(E->getOperand(1));
-
- if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec)
- return false;
- }
-
- return true;
-}
-
-void FuncSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
- if (Depth == RecursionMaxDepth)
- return MustGather.insert(VL.begin(), VL.end());
-
- // Don't handle vectors.
- if (VL[0]->getType()->isVectorTy())
- return;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- if (SI->getValueOperand()->getType()->isVectorTy())
- return;
-
- // If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL))
- return MustGather.insert(VL.begin(), VL.end());
-
- // Stop the scan at unknown IR.
- Instruction *VL0 = dyn_cast<Instruction>(VL[0]);
- assert(VL0 && "Invalid instruction");
-
- // Mark instructions with multiple users.
- int LastIndex = getLastIndex(VL);
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- if (PHINode *PN = dyn_cast<PHINode>(VL[i])) {
- unsigned NumUses = 0;
- // Check that PHINodes have only one external (non-self) use.
- for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end();
- U != UE; ++U) {
- // Don't count self uses.
- if (*U == PN)
- continue;
- NumUses++;
- }
- if (NumUses > 1) {
- DEBUG(dbgs() << "SLP: Adding PHI to MultiUserVals "
- "because it has " << NumUses << " users:" << *PN << " \n");
- UseInfo UI(VL0, 0);
- MultiUserVals[PN] = UI;
- }
- continue;
- }
-
- Instruction *I = dyn_cast<Instruction>(VL[i]);
- // Remember to check if all of the users of this instruction are vectorized
- // within our tree. At depth zero we have no local users, only external
- // users that we don't care about.
- if (Depth && I && I->getNumUses() > 1) {
- DEBUG(dbgs() << "SLP: Adding to MultiUserVals "
- "because it has " << I->getNumUses() << " users:" << *I << " \n");
- UseInfo UI(VL0, LastIndex);
- MultiUserVals[I] = UI;
- }
- }
-
- // Check that the instruction is only used within one lane.
- for (int i = 0, e = VL.size(); i < e; ++i) {
- if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) {
- DEBUG(dbgs() << "SLP: Value used by multiple lanes:" << *VL[i] << "\n");
- return MustGather.insert(VL.begin(), VL.end());
- }
- // Make this instruction as 'seen' and remember the lane.
- LaneMap[VL[i]] = i;
- }
-
- unsigned Opcode = getSameOpcode(VL);
- if (!Opcode)
- return MustGather.insert(VL.begin(), VL.end());
-
- switch (Opcode) {
- case Instruction::PHI: {
- PHINode *PH = dyn_cast<PHINode>(VL0);
-
- // Stop self cycles.
- if (VisitedPHIs.count(PH))
- return;
-
- VisitedPHIs.insert(PH);
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
-
- getTreeUses_rec(Operands, Depth + 1);
- }
- return;
- }
- case Instruction::ExtractElement: {
- VectorType *VecTy = VectorType::get(VL[0]->getType(), VL.size());
- // No need to follow ExtractElements that are going to be optimized away.
- if (CanReuseExtract(VL, VL.size(), VecTy))
- return;
- // Fall through.
- }
- case Instruction::Load:
- return;
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- case Instruction::Select:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
- getTreeUses_rec(Operands, Depth + 1);
- }
- return;
- }
- case Instruction::Store: {
- ValueList Operands;
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
- getTreeUses_rec(Operands, Depth + 1);
- return;
- }
- default:
- return MustGather.insert(VL.begin(), VL.end());
- }
-}
-
-int FuncSLP::getLastIndex(ArrayRef<Value *> VL) {
+int BoUpSLP::getLastIndex(ArrayRef<Value *> VL) {
BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
BlockNumbering &BN = BlocksNumbers[BB];
@@ -617,7 +960,7 @@ int FuncSLP::getLastIndex(ArrayRef<Value *> VL) {
return MaxIdx;
}
-Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) {
+Instruction *BoUpSLP::getLastInstruction(ArrayRef<Value *> VL) {
BasicBlock *BB = cast<Instruction>(VL[0])->getParent();
assert(BB == getSameBlock(VL) && BlocksNumbers.count(BB) && "Invalid block");
BlockNumbering &BN = BlocksNumbers[BB];
@@ -625,15 +968,17 @@ Instruction *FuncSLP::getLastInstruction(ArrayRef<Value *> VL) {
int MaxIdx = BN.getIndex(cast<Instruction>(VL[0]));
for (unsigned i = 1, e = VL.size(); i < e; ++i)
MaxIdx = std::max(MaxIdx, BN.getIndex(cast<Instruction>(VL[i])));
- return BN.getInstruction(MaxIdx);
+ Instruction *I = BN.getInstruction(MaxIdx);
+ assert(I && "bad location");
+ return I;
}
-Instruction *FuncSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) {
+Instruction *BoUpSLP::getInstructionForIndex(unsigned Index, BasicBlock *BB) {
BlockNumbering &BN = BlocksNumbers[BB];
return BN.getInstruction(Index);
}
-int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
+int BoUpSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
BasicBlock *BB = getSameBlock(VL);
assert(BB && "All instructions must come from the same block");
BlockNumbering &BN = BlocksNumbers[BB];
@@ -654,444 +999,7 @@ int FuncSLP::getFirstUserIndex(ArrayRef<Value *> VL) {
return FirstUser;
}
-int FuncSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
- Type *ScalarTy = VL[0]->getType();
-
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
-
- /// Don't mess with vectors.
- if (ScalarTy->isVectorTy())
- return FuncSLP::MAX_COST;
-
- if (allConstant(VL))
- return 0;
-
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
-
- if (isSplat(VL))
- return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
-
- int GatherCost = getGatherCost(VecTy);
- if (Depth == RecursionMaxDepth || needToGatherAny(VL))
- return GatherCost;
-
- BasicBlock *BB = getSameBlock(VL);
- unsigned Opcode = getSameOpcode(VL);
- assert(Opcode && BB && "Invalid Instruction Value");
-
- // Check if it is safe to sink the loads or the stores.
- if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
- int MaxIdx = getLastIndex(VL);
- Instruction *Last = getInstructionForIndex(MaxIdx, BB);
-
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- if (VL[i] == Last)
- continue;
- Value *Barrier = getSinkBarrier(cast<Instruction>(VL[i]), Last);
- if (Barrier) {
- DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last
- << "\n because of " << *Barrier << "\n");
- return MAX_COST;
- }
- }
- }
-
- // Calculate the extract cost.
- unsigned ExternalUserExtractCost = 0;
- for (unsigned i = 0, e = VL.size(); i < e; ++i)
- if (ExtractedLane.count(cast<Instruction>(VL[i])))
- ExternalUserExtractCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
-
- Instruction *VL0 = cast<Instruction>(VL[0]);
- switch (Opcode) {
- case Instruction::PHI: {
- PHINode *PH = dyn_cast<PHINode>(VL0);
-
- // Stop self cycles.
- if (VisitedPHIs.count(PH))
- return 0;
-
- VisitedPHIs.insert(PH);
- int TotalCost = 0;
- // Calculate the cost of all of the operands.
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValue(i));
-
- int Cost = getTreeCost_rec(Operands, Depth + 1);
- if (Cost == MAX_COST)
- return MAX_COST;
- TotalCost += TotalCost;
- }
-
- if (TotalCost > GatherCost) {
- MustGather.insert(VL.begin(), VL.end());
- return GatherCost;
- }
-
- return TotalCost + ExternalUserExtractCost;
- }
- case Instruction::ExtractElement: {
- if (CanReuseExtract(VL, VL.size(), VecTy))
- return 0;
- return getGatherCost(VecTy);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- ValueList Operands;
- Type *SrcTy = VL0->getOperand(0)->getType();
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j) {
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
- // Check that the casted type is the same for all users.
- if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy)
- return getGatherCost(VecTy);
- }
-
- int Cost = getTreeCost_rec(Operands, Depth + 1);
- if (Cost == MAX_COST)
- return MAX_COST;
-
- // Calculate the cost of this instruction.
- int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
- VL0->getType(), SrcTy);
-
- VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
- int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
- Cost += (VecCost - ScalarCost);
-
- if (Cost > GatherCost) {
- MustGather.insert(VL.begin(), VL.end());
- return GatherCost;
- }
-
- return Cost + ExternalUserExtractCost;
- }
- case Instruction::FCmp:
- case Instruction::ICmp: {
- // Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- CmpInst *Cmp = cast<CmpInst>(VL[i]);
- if (Cmp->getPredicate() != P0)
- return getGatherCost(VecTy);
- }
- // Fall through.
- }
- case Instruction::Select:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- int TotalCost = 0;
- // Calculate the cost of all of the operands.
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
-
- int Cost = getTreeCost_rec(Operands, Depth + 1);
- if (Cost == MAX_COST)
- return MAX_COST;
- TotalCost += Cost;
- }
-
- // Calculate the cost of this instruction.
- int ScalarCost = 0;
- int VecCost = 0;
- if (Opcode == Instruction::FCmp || Opcode == Instruction::ICmp ||
- Opcode == Instruction::Select) {
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
- ScalarCost =
- VecTy->getNumElements() *
- TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
- VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
- } else {
- ScalarCost = VecTy->getNumElements() *
- TTI->getArithmeticInstrCost(Opcode, ScalarTy);
- VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
- }
- TotalCost += (VecCost - ScalarCost);
-
- if (TotalCost > GatherCost) {
- MustGather.insert(VL.begin(), VL.end());
- return GatherCost;
- }
-
- return TotalCost + ExternalUserExtractCost;
- }
- case Instruction::Load: {
- // If we are scalarize the loads, add the cost of forming the vector.
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
- if (!isConsecutiveAccess(VL[i], VL[i + 1]))
- return getGatherCost(VecTy);
-
- // Cost of wide load - cost of scalar loads.
- int ScalarLdCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
- int TotalCost = VecLdCost - ScalarLdCost;
-
- if (TotalCost > GatherCost) {
- MustGather.insert(VL.begin(), VL.end());
- return GatherCost;
- }
-
- return TotalCost + ExternalUserExtractCost;
- }
- case Instruction::Store: {
- // We know that we can merge the stores. Calculate the cost.
- int ScalarStCost = VecTy->getNumElements() *
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0);
- int StoreCost = VecStCost - ScalarStCost;
-
- ValueList Operands;
- for (unsigned j = 0; j < VL.size(); ++j) {
- Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
- MemBarrierIgnoreList.insert(VL[j]);
- }
-
- int Cost = getTreeCost_rec(Operands, Depth + 1);
- if (Cost == MAX_COST)
- return MAX_COST;
-
- int TotalCost = StoreCost + Cost;
- return TotalCost + ExternalUserExtractCost;
- }
- default:
- // Unable to vectorize unknown instructions.
- return getGatherCost(VecTy);
- }
-}
-
-int FuncSLP::getTreeCost(ArrayRef<Value *> VL) {
- // Get rid of the list of stores that were removed, and from the
- // lists of instructions with multiple users.
- MemBarrierIgnoreList.clear();
- LaneMap.clear();
- MultiUserVals.clear();
- ExtractedLane.clear();
- MustGather.clear();
- VisitedPHIs.clear();
-
- if (!getSameBlock(VL))
- return MAX_COST;
-
- // Find the location of the last root.
- int LastRootIndex = getLastIndex(VL);
- int FirstUserIndex = getFirstUserIndex(VL);
-
- // Don't vectorize if there are users of the tree roots inside the tree
- // itself.
- if (LastRootIndex > FirstUserIndex)
- return MAX_COST;
-
- // Scan the tree and find which value is used by which lane, and which values
- // must be scalarized.
- getTreeUses_rec(VL, 0);
-
- // Check that instructions with multiple users can be vectorized. Mark
- // unsafe instructions.
- for (MapVector<Instruction *, UseInfo>::iterator UI = MultiUserVals.begin(),
- e = MultiUserVals.end(); UI != e; ++UI) {
- Instruction *Scalar = UI->first;
-
- if (MustGather.count(Scalar))
- continue;
-
- assert(LaneMap.count(Scalar) && "Unknown scalar");
- int ScalarLane = LaneMap[Scalar];
-
- bool ExternalUse = false;
- // Check that all of the users of this instr are within the tree.
- for (Value::use_iterator Usr = Scalar->use_begin(),
- UE = Scalar->use_end(); Usr != UE; ++Usr) {
- // If this user is within the tree, make sure it is from the same lane.
- // Notice that we have both in-tree and out-of-tree users.
- if (LaneMap.count(*Usr)) {
- if (LaneMap[*Usr] != ScalarLane) {
- DEBUG(dbgs() << "SLP: Adding to MustExtract "
- "because of an out-of-lane usage.\n");
- MustGather.insert(Scalar);
- break;
- }
- continue;
- }
-
- // We have an out-of-tree user. Check if we can place an 'extract'.
- Instruction *User = cast<Instruction>(*Usr);
- // We care about the order only if the user is in the same block.
- if (User->getParent() == Scalar->getParent()) {
- int LastLoc = UI->second.LastIndex;
- BlockNumbering &BN = BlocksNumbers[User->getParent()];
- int UserIdx = BN.getIndex(User);
- if (UserIdx <= LastLoc) {
- DEBUG(dbgs() << "SLP: Adding to MustExtract because of an external "
- "user that we can't schedule.\n");
- MustGather.insert(Scalar);
- break;
- }
- }
- // We have an external user.
- ExternalUse = true;
- }
-
- if (ExternalUse) {
- // Items that are left in MultiUserVals are to be extracted.
- // ExtractLane is used for the lookup.
- ExtractedLane.insert(Scalar);
- }
-
- }
-
- // Now calculate the cost of vectorizing the tree.
- return getTreeCost_rec(VL, 0);
-}
-bool FuncSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
- unsigned ChainLen = Chain.size();
- DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
- << "\n");
- Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
- unsigned Sz = DL->getTypeSizeInBits(StoreTy);
- unsigned VF = MinVecRegSize / Sz;
-
- if (!isPowerOf2_32(Sz) || VF < 2)
- return false;
-
- bool Changed = false;
- // Look for profitable vectorizable trees at all offsets, starting at zero.
- for (unsigned i = 0, e = ChainLen; i < e; ++i) {
- if (i + VF > e)
- break;
- DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
- << "\n");
- ArrayRef<Value *> Operands = Chain.slice(i, VF);
-
- int Cost = getTreeCost(Operands);
- if (Cost == FuncSLP::MAX_COST)
- continue;
- DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
- if (Cost < CostThreshold) {
- DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
- vectorizeTree(Operands);
-
- // Remove the scalar stores.
- for (int j = 0, e = VF; j < e; ++j)
- cast<Instruction>(Operands[j])->eraseFromParent();
-
- // Move to the next bundle.
- i += VF - 1;
- Changed = true;
- }
- }
-
- if (Changed || ChainLen > VF)
- return Changed;
-
- // Handle short chains. This helps us catch types such as <3 x float> that
- // are smaller than vector size.
- int Cost = getTreeCost(Chain);
- if (Cost == FuncSLP::MAX_COST)
- return false;
- if (Cost < CostThreshold) {
- DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
- << " for size = " << ChainLen << "\n");
- vectorizeTree(Chain);
-
- // Remove all of the scalar stores.
- for (int i = 0, e = Chain.size(); i < e; ++i)
- cast<Instruction>(Chain[i])->eraseFromParent();
-
- return true;
- }
-
- return false;
-}
-
-bool FuncSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
- SetVector<Value *> Heads, Tails;
- SmallDenseMap<Value *, Value *> ConsecutiveChain;
-
- // We may run into multiple chains that merge into a single chain. We mark the
- // stores that we vectorized so that we don't visit the same store twice.
- ValueSet VectorizedStores;
- bool Changed = false;
-
- // Do a quadratic search on all of the given stores and find
- // all of the pairs of loads that follow each other.
- for (unsigned i = 0, e = Stores.size(); i < e; ++i)
- for (unsigned j = 0; j < e; ++j) {
- if (i == j)
- continue;
-
- if (isConsecutiveAccess(Stores[i], Stores[j])) {
- Tails.insert(Stores[j]);
- Heads.insert(Stores[i]);
- ConsecutiveChain[Stores[i]] = Stores[j];
- }
- }
-
- // For stores that start but don't end a link in the chain:
- for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
- continue;
-
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
- ValueList Operands;
- Value *I = *it;
- // Collect the chain into a list.
- while (Tails.count(I) || Heads.count(I)) {
- if (VectorizedStores.count(I))
- break;
- Operands.push_back(I);
- // Move to the next value in the chain.
- I = ConsecutiveChain[I];
- }
-
- bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
-
- // Mark the vectorized stores so that we don't vectorize them again.
- if (Vectorized)
- VectorizedStores.insert(Operands.begin(), Operands.end());
- Changed |= Vectorized;
- }
-
- return Changed;
-}
-
-Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
+Value *BoUpSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
Value *Vec = UndefValue::get(Ty);
// Generate the 'InsertElement' instruction.
for (unsigned i = 0; i < Ty->getNumElements(); ++i) {
@@ -1103,282 +1011,292 @@ Value *FuncSLP::Gather(ArrayRef<Value *> VL, VectorType *Ty) {
return Vec;
}
-Value *FuncSLP::vectorizeTree_rec(ArrayRef<Value *> VL) {
- BuilderLocGuard Guard(Builder);
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ if (ScalarToTreeEntry.count(VL[0])) {
+ int Idx = ScalarToTreeEntry[VL[0]];
+ TreeEntry *E = &VectorizableTree[Idx];
+ if (E->isSame(VL))
+ return vectorizeTree(E);
+ }
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
- if (needToGatherAny(VL))
- return Gather(VL, VecTy);
+ return Gather(VL, VecTy);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ BuilderLocGuard Guard(Builder);
- if (VectorizedValues.count(VL[0])) {
- DEBUG(dbgs() << "SLP: Diamond merged at depth.\n");
- return VectorizedValues[VL[0]];
+ if (E->VectorizedValue) {
+ DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
}
- Instruction *VL0 = cast<Instruction>(VL[0]);
- unsigned Opcode = VL0->getOpcode();
- assert(Opcode == getSameOpcode(VL) && "Invalid opcode");
+ Type *ScalarTy = E->Scalars[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(E->Scalars[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
- switch (Opcode) {
- case Instruction::PHI: {
- PHINode *PH = dyn_cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- VectorizedValues[VL0] = NewPhi;
+ if (E->NeedToGather) {
+ return Gather(E->Scalars, VecTy);
+ }
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- ValueList Operands;
- BasicBlock *IBB = PH->getIncomingBlock(i);
+ Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+ unsigned Opcode = VL0->getOpcode();
+ assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode");
- // Prepare the operand vector.
- for (unsigned j = 0; j < VL.size(); ++j)
- Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(IBB));
+ switch (Opcode) {
+ case Instruction::PHI: {
+ PHINode *PH = dyn_cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstInsertionPt());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ E->VectorizedValue = NewPhi;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ // Prepare the operand vector.
+ for (unsigned j = 0; j < E->Scalars.size(); ++j)
+ Operands.push_back(cast<PHINode>(E->Scalars[j])->
+ getIncomingValueForBlock(IBB));
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Value *Vec = vectorizeTree(Operands);
+ NewPhi->addIncoming(Vec, IBB);
+ }
- Builder.SetInsertPoint(IBB->getTerminator());
- Value *Vec = vectorizeTree_rec(Operands);
- NewPhi->addIncoming(Vec, IBB);
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return NewPhi;
}
- assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
- "Invalid number of incoming values");
- return NewPhi;
- }
-
- case Instruction::ExtractElement: {
- if (CanReuseExtract(VL, VL.size(), VecTy))
- return VL0->getOperand(0);
- return Gather(VL, VecTy);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- ValueList INVL;
- for (int i = 0, e = VL.size(); i < e; ++i)
- INVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
-
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *InVec = vectorizeTree_rec(INVL);
- CastInst *CI = dyn_cast<CastInst>(VL0);
- Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
- VectorizedValues[VL0] = V;
- return V;
- }
- case Instruction::FCmp:
- case Instruction::ICmp: {
- // Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
- for (unsigned i = 1, e = VL.size(); i < e; ++i) {
- CmpInst *Cmp = cast<CmpInst>(VL[i]);
- if (Cmp->getPredicate() != P0)
- return Gather(VL, VecTy);
+ case Instruction::ExtractElement: {
+ if (CanReuseExtract(E->Scalars)) {
+ Value *V = VL0->getOperand(0);
+ E->VectorizedValue = V;
+ return V;
+ }
+ return Gather(E->Scalars, VecTy);
}
-
- ValueList LHSV, RHSV;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- LHSV.push_back(cast<Instruction>(VL[i])->getOperand(0));
- RHSV.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ ValueList INVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ INVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ Value *InVec = vectorizeTree(INVL);
+ CastInst *CI = dyn_cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ E->VectorizedValue = V;
+ return V;
}
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ ValueList LHSV, RHSV;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSV.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *L = vectorizeTree_rec(LHSV);
- Value *R = vectorizeTree_rec(RHSV);
- Value *V;
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ Value *L = vectorizeTree(LHSV);
+ Value *R = vectorizeTree(RHSV);
+ Value *V;
- if (Opcode == Instruction::FCmp)
- V = Builder.CreateFCmp(P0, L, R);
- else
- V = Builder.CreateICmp(P0, L, R);
+ CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+ if (Opcode == Instruction::FCmp)
+ V = Builder.CreateFCmp(P0, L, R);
+ else
+ V = Builder.CreateICmp(P0, L, R);
- VectorizedValues[VL0] = V;
- return V;
- }
- case Instruction::Select: {
- ValueList TrueVec, FalseVec, CondVec;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- CondVec.push_back(cast<Instruction>(VL[i])->getOperand(0));
- TrueVec.push_back(cast<Instruction>(VL[i])->getOperand(1));
- FalseVec.push_back(cast<Instruction>(VL[i])->getOperand(2));
+ E->VectorizedValue = V;
+ return V;
}
+ case Instruction::Select: {
+ ValueList TrueVec, FalseVec, CondVec;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ CondVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ TrueVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ FalseVec.push_back(cast<Instruction>(E->Scalars[i])->getOperand(2));
+ }
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *True = vectorizeTree_rec(TrueVec);
- Value *False = vectorizeTree_rec(FalseVec);
- Value *Cond = vectorizeTree_rec(CondVec);
- Value *V = Builder.CreateSelect(Cond, True, False);
- VectorizedValues[VL0] = V;
- return V;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- ValueList LHSVL, RHSVL;
- for (int i = 0, e = VL.size(); i < e; ++i) {
- LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0));
- RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1));
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ Value *Cond = vectorizeTree(CondVec);
+ Value *True = vectorizeTree(TrueVec);
+ Value *False = vectorizeTree(FalseVec);
+ Value *V = Builder.CreateSelect(Cond, True, False);
+ E->VectorizedValue = V;
+ return V;
}
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ ValueList LHSVL, RHSVL;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i) {
+ LHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(0));
+ RHSVL.push_back(cast<Instruction>(E->Scalars[i])->getOperand(1));
+ }
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *LHS = vectorizeTree_rec(LHSVL);
- Value *RHS = vectorizeTree_rec(RHSVL);
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ Value *LHS = vectorizeTree(LHSVL);
+ Value *RHS = vectorizeTree(RHSVL);
- if (LHS == RHS) {
- assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
- }
+ if (LHS == RHS && isa<Instruction>(LHS)) {
+ assert((VL0->getOperand(0) == VL0->getOperand(1)) && "Invalid order");
+ }
- BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
- VectorizedValues[VL0] = V;
- return V;
- }
- case Instruction::Load: {
- // Check if all of the loads are consecutive.
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (!isConsecutiveAccess(VL[i - 1], VL[i]))
- return Gather(VL, VecTy);
-
- // Loads are inserted at the head of the tree because we don't want to
- // sink them all the way down past store instructions.
- Builder.SetInsertPoint(getLastInstruction(VL));
- LoadInst *LI = cast<LoadInst>(VL0);
- Value *VecPtr =
- Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
- unsigned Alignment = LI->getAlignment();
- LI = Builder.CreateLoad(VecPtr);
- LI->setAlignment(Alignment);
-
- VectorizedValues[VL0] = LI;
- return LI;
+ BinaryOperator *BinOp = cast<BinaryOperator>(VL0);
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS);
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ LoadInst *LI = cast<LoadInst>(VL0);
+ Value *VecPtr =
+ Builder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo());
+ unsigned Alignment = LI->getAlignment();
+ LI = Builder.CreateLoad(VecPtr);
+ LI->setAlignment(Alignment);
+ E->VectorizedValue = LI;
+ return LI;
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(VL0);
+ unsigned Alignment = SI->getAlignment();
+
+ ValueList ValueOp;
+ for (int i = 0, e = E->Scalars.size(); i < e; ++i)
+ ValueOp.push_back(cast<StoreInst>(E->Scalars[i])->getValueOperand());
+
+ Builder.SetInsertPoint(getLastInstruction(E->Scalars));
+ Value *VecValue = vectorizeTree(ValueOp);
+ Value *VecPtr =
+ Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
+ StoreInst *S = Builder.CreateStore(VecValue, VecPtr);
+ S->setAlignment(Alignment);
+ E->VectorizedValue = S;
+ return S;
+ }
+ default:
+ llvm_unreachable("unknown inst");
}
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(VL0);
- unsigned Alignment = SI->getAlignment();
+ return 0;
+}
- ValueList ValueOp;
- for (int i = 0, e = VL.size(); i < e; ++i)
- ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand());
+void BoUpSLP::vectorizeTree() {
+ vectorizeTree(&VectorizableTree[0]);
- Value *VecValue = vectorizeTree_rec(ValueOp);
+ // For each vectorized value:
+ for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) {
+ TreeEntry *Entry = &VectorizableTree[EIdx];
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *VecPtr =
- Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo());
- Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment);
- return 0;
- }
- default:
- return Gather(VL, VecTy);
- }
-}
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
-Value *FuncSLP::vectorizeTree(ArrayRef<Value *> VL) {
- Builder.SetInsertPoint(getLastInstruction(VL));
- Value *V = vectorizeTree_rec(VL);
+ // No need to handle users of gathered values.
+ if (Entry->NeedToGather)
+ continue;
- DEBUG(dbgs() << "SLP: Placing 'extracts'\n");
- for (SetVector<Instruction*>::iterator it = ExtractedLane.begin(), e =
- ExtractedLane.end(); it != e; ++it) {
- Instruction *Scalar = *it;
- DEBUG(dbgs() << "SLP: Looking at " << *Scalar);
+ Value *Vec = Entry->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
- if (!Scalar)
- continue;
+ SmallVector<User*, 16> Users(Scalar->use_begin(), Scalar->use_end());
- Instruction *Loc = 0;
+ for (SmallVector<User*, 16>::iterator User = Users.begin(),
+ UE = Users.end(); User != UE; ++User) {
+ DEBUG(dbgs() << "SLP: \tupdating user " << **User << ".\n");
- assert(MultiUserVals.count(Scalar) && "Can't find the lane to extract");
- Instruction *Leader = MultiUserVals[Scalar].Leader;
+ bool Gathered = MustGather.count(*User);
- // This value is gathered so we don't need to extract from anywhere.
- if (!VectorizedValues.count(Leader))
- continue;
+ // Skip in-tree scalars that become vectors.
+ if (ScalarToTreeEntry.count(*User) && !Gathered) {
+ DEBUG(dbgs() << "SLP: \tUser will be removed soon:" <<
+ **User << ".\n");
+ int Idx = ScalarToTreeEntry[*User]; (void) Idx;
+ assert(!VectorizableTree[Idx].NeedToGather && "bad state ?");
+ continue;
+ }
- Value *Vec = VectorizedValues[Leader];
- if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
- Loc = PN->getParent()->getFirstInsertionPt();
- } else {
- Instruction *I = cast<Instruction>(Vec);
- BasicBlock::iterator L = *I;
- Loc = ++L;
- }
+ if (!isa<Instruction>(*User))
+ continue;
- Builder.SetInsertPoint(Loc);
- assert(LaneMap.count(Scalar) && "Can't find the extracted lane.");
- int Lane = LaneMap[Scalar];
- Value *Idx = Builder.getInt32(Lane);
- Value *Extract = Builder.CreateExtractElement(Vec, Idx);
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ Instruction *Loc = 0;
+ if (PHINode *PN = dyn_cast<PHINode>(Vec)) {
+ Loc = PN->getParent()->getFirstInsertionPt();
+ } else if (Instruction *Iv = dyn_cast<Instruction>(Vec)){
+ Loc = ++((BasicBlock::iterator)*Iv);
+ } else {
+ Loc = F->getEntryBlock().begin();
+ }
- bool Replaced = false;;
- for (Value::use_iterator U = Scalar->use_begin(), UE = Scalar->use_end();
- U != UE; ++U) {
- Instruction *UI = cast<Instruction>(*U);
- // No need to replace instructions that are inside our lane map.
- if (LaneMap.count(UI))
- continue;
+ Builder.SetInsertPoint(Loc);
+ Value *Ex = Builder.CreateExtractElement(Vec, Builder.getInt32(Lane));
+ (*User)->replaceUsesOfWith(Scalar, Ex);
+ DEBUG(dbgs() << "SLP: \tupdated user:" << **User << ".\n");
+ }
- UI->replaceUsesOfWith(Scalar ,Extract);
- Replaced = true;
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (Value::use_iterator User = Scalar->use_begin(), UE = Scalar->use_end();
+ User != UE; ++User) {
+ DEBUG(dbgs() << "SLP: \tvalidating user:" << **User << ".\n");
+ assert(!MustGather.count(*User) &&
+ "Replacing gathered value with undef");
+ assert(ScalarToTreeEntry.count(*User) &&
+ "Replacing out-of-tree value with undef");
+ }
+ Value *Undef = UndefValue::get(Ty);
+ Scalar->replaceAllUsesWith(Undef);
+ }
+ DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ cast<Instruction>(Scalar)->eraseFromParent();
}
- assert(Replaced && "Must replace at least one outside user");
- (void)Replaced;
}
- // We moved some instructions around. We have to number them again
- // before we can do any analysis.
- forgetNumbering();
-
- // Clear the state.
- MustGather.clear();
- VisitedPHIs.clear();
- VectorizedValues.clear();
- MemBarrierIgnoreList.clear();
- return V;
-}
-
-Value *FuncSLP::vectorizeArith(ArrayRef<Value *> Operands) {
- Instruction *LastInst = getLastInstruction(Operands);
- Value *Vec = vectorizeTree(Operands);
- // After vectorizing the operands we need to generate extractelement
- // instructions and replace all of the uses of the scalar values with
- // the values that we extracted from the vectorized tree.
- Builder.SetInsertPoint(LastInst);
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i));
- Operands[i]->replaceAllUsesWith(S);
+ for (Function::iterator it = F->begin(), e = F->end(); it != e; ++it) {
+ BlocksNumbers[it].forget();
}
-
- forgetNumbering();
- return Vec;
}
-void FuncSLP::optimizeGatherSequence() {
+void BoUpSLP::optimizeGatherSequence() {
+ DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
// LICM InsertElementInst sequences.
for (SetVector<Instruction *>::iterator it = GatherSeq.begin(),
e = GatherSeq.end(); it != e; ++it) {
@@ -1449,8 +1367,6 @@ void FuncSLP::optimizeGatherSequence() {
assert((*v)->getNumUses() == 0 && "Can't remove instructions with uses");
(*v)->eraseFromParent();
}
-
- forgetNumbering();
}
/// The SLPVectorizer Pass.
@@ -1492,7 +1408,7 @@ struct SLPVectorizer : public FunctionPass {
// Use the bollom up slp vectorizer to construct chains that start with
// he store instructions.
- FuncSLP R(&F, SE, DL, TTI, AA, LI, DT);
+ BoUpSLP R(&F, SE, DL, TTI, AA, LI, DT);
// Scan the blocks in the function in post order.
for (po_iterator<BasicBlock*> it = po_begin(&F.getEntryBlock()),
@@ -1536,31 +1452,146 @@ private:
/// object. We sort the stores to their base objects to reduce the cost of the
/// quadratic search on the stores. TODO: We can further reduce this cost
/// if we flush the chain creation every time we run into a memory barrier.
- unsigned collectStores(BasicBlock *BB, FuncSLP &R);
+ unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
- bool tryToVectorizePair(Value *A, Value *B, FuncSLP &R);
+ bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
/// \brief Try to vectorize a list of operands. If \p NeedExtracts is true
/// then we calculate the cost of extracting the scalars from the vector.
/// \returns true if a value was vectorized.
- bool tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R, bool NeedExtracts);
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, bool NeedExtracts);
/// \brief Try to vectorize a chain that may start at the operands of \V;
- bool tryToVectorize(BinaryOperator *V, FuncSLP &R);
+ bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
/// \brief Vectorize the stores that were collected in StoreRefs.
- bool vectorizeStoreChains(FuncSLP &R);
+ bool vectorizeStoreChains(BoUpSLP &R);
/// \brief Scan the basic block and look for patterns that are likely to start
/// a vectorization chain.
- bool vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R);
+ bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
+
+ bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold,
+ BoUpSLP &R);
+ bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
+ BoUpSLP &R);
private:
StoreListMap StoreRefs;
};
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
+ int CostThreshold, BoUpSLP &R) {
+ unsigned ChainLen = Chain.size();
+ DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
+ << "\n");
+ Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+ unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+ unsigned VF = MinVecRegSize / Sz;
+
+ if (!isPowerOf2_32(Sz) || VF < 2)
+ return false;
+
+ bool Changed = false;
+ // Look for profitable vectorizable trees at all offsets, starting at zero.
+ for (unsigned i = 0, e = ChainLen; i < e; ++i) {
+ if (i + VF > e)
+ break;
+ DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << i
+ << "\n");
+ ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+ R.buildTree(Operands);
+
+ int Cost = R.getTreeCost();
+
+ DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+ if (Cost < CostThreshold) {
+ DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+ R.vectorizeTree();
+
+ // Move to the next bundle.
+ i += VF - 1;
+ Changed = true;
+ }
+ }
+
+ if (Changed || ChainLen > VF)
+ return Changed;
+
+ // Handle short chains. This helps us catch types such as <3 x float> that
+ // are smaller than vector size.
+ R.buildTree(Chain);
+
+ int Cost = R.getTreeCost();
+
+ if (Cost < CostThreshold) {
+ DEBUG(dbgs() << "SLP: Found store chain cost = " << Cost
+ << " for size = " << ChainLen << "\n");
+ R.vectorizeTree();
+ return true;
+ }
+
+ return false;
+}
+
+bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ int costThreshold, BoUpSLP &R) {
+ SetVector<Value *> Heads, Tails;
+ SmallDenseMap<Value *, Value *> ConsecutiveChain;
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of loads that follow each other.
+ for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+ for (unsigned j = 0; j < e; ++j) {
+ if (i == j)
+ continue;
+
+ if (R.isConsecutiveAccess(Stores[i], Stores[j])) {
+ Tails.insert(Stores[j]);
+ Heads.insert(Stores[i]);
+ ConsecutiveChain[Stores[i]] = Stores[j];
+ }
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<Value *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ Value *I = *it;
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (VectorizedStores.count(I))
+ break;
+ Operands.push_back(I);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = vectorizeStoreChain(Operands, costThreshold, R);
+
+ // Mark the vectorized stores so that we don't vectorize them again.
+ if (Vectorized)
+ VectorizedStores.insert(Operands.begin(), Operands.end());
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
+
+unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
unsigned count = 0;
StoreRefs.clear();
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
@@ -1585,14 +1616,14 @@ unsigned SLPVectorizer::collectStores(BasicBlock *BB, FuncSLP &R) {
return count;
}
-bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
Value *VL[] = { A, B };
return tryToVectorizeList(VL, R, true);
}
-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
+bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
bool NeedExtracts) {
if (VL.size() < 2)
return false;
@@ -1615,9 +1646,8 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
return 0;
}
- int Cost = R.getTreeCost(VL);
- if (Cost == FuncSLP::MAX_COST)
- return false;
+ R.buildTree(VL);
+ int Cost = R.getTreeCost();
int ExtrCost = NeedExtracts ? R.getGatherCost(VL) : 0;
DEBUG(dbgs() << "SLP: Cost of pair:" << Cost
@@ -1625,11 +1655,11 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, FuncSLP &R,
if ((Cost + ExtrCost) >= -SLPCostThreshold)
return false;
DEBUG(dbgs() << "SLP: Vectorizing pair.\n");
- R.vectorizeArith(VL);
+ R.vectorizeTree();
return true;
}
-bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
+bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) {
if (!V)
return false;
@@ -1669,7 +1699,7 @@ bool SLPVectorizer::tryToVectorize(BinaryOperator *V, FuncSLP &R) {
return 0;
}
-bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
+bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
if (isa<DbgInfoIntrinsic>(it))
@@ -1737,7 +1767,7 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, FuncSLP &R) {
return Changed;
}
-bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
+bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
bool Changed = false;
// Attempt to sort and vectorize each of the store-groups.
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
@@ -1748,7 +1778,7 @@ bool SLPVectorizer::vectorizeStoreChains(FuncSLP &R) {
DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
<< it->second.size() << ".\n");
- Changed |= R.vectorizeStores(it->second, -SLPCostThreshold);
+ Changed |= vectorizeStores(it->second, -SLPCostThreshold, R);
}
return Changed;
}