summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorArnold Schwaighofer <aschwaighofer@apple.com>2014-01-28 01:01:53 +0000
committerArnold Schwaighofer <aschwaighofer@apple.com>2014-01-28 01:01:53 +0000
commita47aa4b4efb2b6c5944c2edb9a4346c0f3ffcaf1 (patch)
treef055ceb22bddd71944654b889fb90cf9944c563c /lib
parent1c2827cd6a5456126b9a8041642148279456f71a (diff)
downloadllvm-a47aa4b4efb2b6c5944c2edb9a4346c0f3ffcaf1.tar.gz
llvm-a47aa4b4efb2b6c5944c2edb9a4346c0f3ffcaf1.tar.bz2
llvm-a47aa4b4efb2b6c5944c2edb9a4346c0f3ffcaf1.tar.xz
LoopVectorize: Support conditional stores by scalarizing
The vectorizer takes a loop like this and widens all instructions except for the store. The stores are scalarized/unrolled and hidden behind an "if" block. for (i = 0; i < 128; ++i) { if (a[i] < 10) a[i] += val; } for (i = 0; i < 128; i+=2) { v = a[i:i+1]; v0 = (extract v, 0) + 10; v1 = (extract v, 1) + 10; if (v0 < 10) a[i] = v0; if (v1 < 10) a[i] = v1; } The vectorizer relies on subsequent optimizations to sink instructions into the conditional block where they are anticipated. The flag "vectorize-num-stores-pred" controls whether and how many stores to handle this way. Vectorization of conditional stores is disabled per default for now. This patch also adds a change to the heuristic when the flag "enable-loadstore-runtime-unroll" is enabled (off by default). It unrolls small loops until load/store ports are saturated. This heuristic uses TTI's getMaxUnrollFactor as a measure for load/store ports. I also added a second flag -enable-cond-stores-vec. It will enable vectorization of conditional stores. But there is no cost model for vectorization of conditional stores in place yet so this will not do good at the moment. rdar://15892953 Results for x86-64 -O3 -mavx +/- -mllvm -enable-loadstore-runtime-unroll -vectorize-num-stores-pred=1 (before the BFI change): Performance Regressions: Benchmarks/Ptrdist/yacr2/yacr2 7.35% (maze3() is identical but 10% slower) Applications/siod/siod 2.18% Performance improvements: mesa -4.42% libquantum -4.15% With a patch that slightly changes the register heuristics (by subtracting the induction variable on both sides of the register pressure equation, as the induction variable is probably not really unrolled): Performance Regressions: Benchmarks/Ptrdist/yacr2/yacr2 7.73% Applications/siod/siod 1.97% Performance Improvements: libquantum -13.05% (we now also unroll quantum_toffoli) mesa -4.27% git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200270 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp223
1 files changed, 194 insertions, 29 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5d1f85f86d..7867495284 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -172,6 +172,20 @@ static cl::opt<unsigned> SmallLoopCost(
"small-loop-cost", cl::init(20), cl::Hidden,
cl::desc("The cost of a loop that is considered 'small' by the unroller."));
+// Runtime unroll loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeUnroll(
+ "enable-loadstore-runtime-unroll", cl::init(false), cl::Hidden,
+ cl::desc("Enable runtime unrolling until load/store ports are saturated"));
+
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(0), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(false), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
namespace {
// Forward declarations.
@@ -275,8 +289,11 @@ protected:
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars.
- virtual void scalarizeInstruction(Instruction *Instr);
+ /// of scalars. If \p IfPredicateStore is true we need to 'hide' each
+ /// scalarized instruction behind an if block predicated on the control
+ /// dependence of the instruction.
+ virtual void scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore=false);
/// Vectorize Load and Store instructions,
virtual void vectorizeMemoryInstruction(Instruction *Instr);
@@ -379,7 +396,7 @@ protected:
///The ExitBlock of the scalar loop.
BasicBlock *LoopExitBlock;
///The vector loop body.
- BasicBlock *LoopVectorBody;
+ SmallVector<BasicBlock *, 4> LoopVectorBody;
///The scalar loop body.
BasicBlock *LoopScalarBody;
/// A list of all bypass blocks. The first block is the entry of the loop.
@@ -406,7 +423,7 @@ public:
InnerLoopVectorizer(OrigLoop, SE, LI, DT, DL, TLI, 1, UnrollFactor) { }
private:
- virtual void scalarizeInstruction(Instruction *Instr);
+ virtual void scalarizeInstruction(Instruction *Instr, bool IfPredicateStore = false);
virtual void vectorizeMemoryInstruction(Instruction *Instr);
virtual Value *getBroadcastInstrs(Value *V);
virtual Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
@@ -456,10 +473,14 @@ static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
/// induction variable and the different reduction variables.
class LoopVectorizationLegality {
public:
+ unsigned NumLoads;
+ unsigned NumStores;
+ unsigned NumPredStores;
+
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
DominatorTree *DT, TargetLibraryInfo *TLI)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), TLI(TLI),
- Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
+ : NumLoads(0), NumStores(0), NumPredStores(0), TheLoop(L), SE(SE), DL(DL),
+ DT(DT), TLI(TLI), Induction(0), WidestIndTy(0), HasFunNoNaNAttr(false),
MaxSafeDepDistBytes(-1U) {}
/// This enum represents the kinds of reductions that we support.
@@ -1206,7 +1227,9 @@ void LoopVectorizationLegality::RuntimePointerCheck::insert(
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// We need to place the broadcast of invariant variables outside the loop.
Instruction *Instr = dyn_cast<Instruction>(V);
- bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+ bool NewInstr =
+ (Instr && std::find(LoopVectorBody.begin(), LoopVectorBody.end(),
+ Instr->getParent()) != LoopVectorBody.end());
bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
// Place the code for broadcasting invariant variables in the new preheader.
@@ -1411,6 +1434,9 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+ if (SI && Legal->blockNeedsPredication(SI->getParent()))
+ return scalarizeInstruction(Instr, true);
+
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
@@ -1529,7 +1555,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
@@ -1574,10 +1600,37 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ Instruction *InsertPt = Builder.GetInsertPoint();
+ BasicBlock *IfBlock = Builder.GetInsertBlock();
+ BasicBlock *CondBlock = 0;
+
+ VectorParts Cond;
+ Loop *VectorLp = 0;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ VectorLp = LI->getLoopFor(IfBlock);
+ assert(VectorLp && "Must have a loop for this block");
+ }
+
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
// For each scalar that we create:
for (unsigned Width = 0; Width < VF; ++Width) {
+
+ // Start if-block.
+ Value *Cmp = 0;
+ if (IfPredicateStore) {
+ Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Width));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp, ConstantInt::get(Cmp->getType(), 1));
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+ // Update Builder with newly created basic block.
+ Builder.SetInsertPoint(InsertPt);
+ }
+
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
@@ -1598,6 +1651,16 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
if (!IsVoidRetTy)
VecResults[Part] = Builder.CreateInsertElement(VecResults[Part], Cloned,
Builder.getInt32(Width));
+ // End if-block.
+ if (IfPredicateStore) {
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
}
}
}
@@ -2101,7 +2164,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
LoopScalarPreHeader = ScalarPH;
LoopMiddleBlock = MiddleBlock;
LoopExitBlock = ExitBlock;
- LoopVectorBody = VecBody;
+ LoopVectorBody.push_back(VecBody);
LoopScalarBody = OldBasicBlock;
LoopVectorizeHints Hints(Lp, true);
@@ -2369,25 +2432,42 @@ struct CSEDenseMapInfo {
};
}
+/// \brief Check whether this block is a predicated block.
+/// Due to if predication of stores we might create a sequence of "if(pred) a[i]
+/// = ...; " blocks. We start with one vectorized basic block. For every
+/// conditional block we split this vectorized block. Therefore, every second
+/// block will be a predicated one.
+static bool isPredicatedBlock(unsigned BlockNum) {
+ return BlockNum % 2;
+}
+
///\brief Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
+static void cse(SmallVector<BasicBlock *, 4> &BBs) {
// Perform simple cse.
SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = I++;
+ for (unsigned i = 0, e = BBs.size(); i != e; ++i) {
+ BasicBlock *BB = BBs[i];
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = I++;
- if (!CSEDenseMapInfo::canHandle(In))
- continue;
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
- // Check if we can replace this instruction with any of the
- // visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
- continue;
- }
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+ // Ignore instructions in conditional blocks. We create "if (pred) a[i] =
+ // ...;" blocks for predicated stores. Every second block is a predicated
+ // block.
+ if (isPredicatedBlock(i))
+ continue;
- CSEMap[In] = In;
+ CSEMap[In] = In;
+ }
}
}
@@ -2503,7 +2583,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
// first unroll part.
Value *StartVal = (part == 0) ? VectorStart : Identity;
cast<PHINode>(VecRdxPhi[part])->addIncoming(StartVal, VecPreheader);
- cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part], LoopVectorBody);
+ cast<PHINode>(VecRdxPhi[part])->addIncoming(Val[part],
+ LoopVectorBody.back());
}
// Before each round, move the insertion point right between
@@ -2522,7 +2603,8 @@ void InnerLoopVectorizer::vectorizeLoop() {
Value *StartVal = (part == 0) ? VectorStart : Identity;
for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
NewPhi->addIncoming(StartVal, LoopBypassBlocks[I]);
- NewPhi->addIncoming(RdxExitVal[part], LoopVectorBody);
+ NewPhi->addIncoming(RdxExitVal[part],
+ LoopVectorBody.back());
RdxParts.push_back(NewPhi);
}
@@ -2695,7 +2777,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
Type *VecTy = (VF == 1) ? PN->getType() :
VectorType::get(PN->getType(), VF);
Entry[part] = PHINode::Create(VecTy, 2, "vec.phi",
- LoopVectorBody-> getFirstInsertionPt());
+ LoopVectorBody.back()-> getFirstInsertionPt());
}
PV->push_back(P);
return;
@@ -3044,7 +3126,19 @@ void InnerLoopVectorizer::updateAnalysis() {
for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
DT->addNewBlock(LoopBypassBlocks[I], LoopBypassBlocks[I-1]);
DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlocks.back());
- DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+
+ // Due to if predication of stores we might create a sequence of "if(pred)
+ // a[i] = ...; " blocks.
+ for (unsigned i = 0, e = LoopVectorBody.size(); i != e; ++i) {
+ if (i == 0)
+ DT->addNewBlock(LoopVectorBody[0], LoopVectorPreHeader);
+ else if (isPredicatedBlock(i)) {
+ DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-1]);
+ } else {
+ DT->addNewBlock(LoopVectorBody[i], LoopVectorBody[i-2]);
+ }
+ }
+
DT->addNewBlock(LoopMiddleBlock, LoopBypassBlocks.front());
DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
@@ -4292,6 +4386,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
DEBUG(dbgs() << "LV: Found a non-simple load.\n");
return false;
}
+ NumLoads++;
Loads.push_back(Ld);
DepChecker.addAccess(Ld);
continue;
@@ -4305,6 +4400,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
DEBUG(dbgs() << "LV: Found a non-simple store.\n");
return false;
}
+ NumStores++;
Stores.push_back(St);
DepChecker.addAccess(St);
}
@@ -4816,7 +4912,16 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
}
// We don't predicate stores at the moment.
- if (it->mayWriteToMemory() || it->mayThrow())
+ if (it->mayWriteToMemory()) {
+ StoreInst *SI = dyn_cast<StoreInst>(it);
+ // We only support predication of stores in basic blocks with one
+ // predecessor.
+ if (!SI || ++NumPredStores > NumberOfStoresToPredicate ||
+ !SafePtrs.count(SI->getPointerOperand()) ||
+ !SI->getParent()->getSinglePredecessor())
+ return false;
+ }
+ if (it->mayThrow())
return false;
// Check that we don't have a constant expression that can trap as operand.
@@ -4851,6 +4956,11 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
return Factor;
}
+ if (!EnableCondStoresVectorization && Legal->NumPredStores) {
+ DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+ return Factor;
+ }
+
// Find the trip count.
unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch());
DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
@@ -5066,6 +5176,17 @@ LoopVectorizationCostModel::selectUnrollFactor(bool OptForSize,
return UF;
}
+ if (EnableLoadStoreRuntimeUnroll &&
+ !Legal->getRuntimePointerCheck()->Need &&
+ LoopCost < SmallLoopCost) {
+ // Unroll until store/load ports (estimated by max unroll factor) are
+ // saturated.
+ unsigned UnrollStores = UF / (Legal->NumStores ? Legal->NumStores : 1);
+ unsigned UnrollLoads = UF / (Legal->NumLoads ? Legal->NumLoads : 1);
+ UF = std::max(std::min(UnrollStores, UnrollLoads), 1u);
+ return UF;
+ }
+
// We want to unroll tiny loops in order to reduce the loop overhead.
// We assume that the cost overhead is 1 and we use the cost model
// to estimate the cost of the loop and unroll until the cost of the
@@ -5515,7 +5636,8 @@ bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
}
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
+ bool IfPredicateStore) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<VectorParts, 4> Params;
@@ -5560,10 +5682,39 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
// Create a new entry in the WidenMap and initialize it to Undef or Null.
VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
+ Instruction *InsertPt = Builder.GetInsertPoint();
+ BasicBlock *IfBlock = Builder.GetInsertBlock();
+ BasicBlock *CondBlock = 0;
+
+ VectorParts Cond;
+ Loop *VectorLp = 0;
+ if (IfPredicateStore) {
+ assert(Instr->getParent()->getSinglePredecessor() &&
+ "Only support single predecessor blocks");
+ Cond = createEdgeMask(Instr->getParent()->getSinglePredecessor(),
+ Instr->getParent());
+ VectorLp = LI->getLoopFor(IfBlock);
+ assert(VectorLp && "Must have a loop for this block");
+ }
+
// For each vector unroll 'part':
for (unsigned Part = 0; Part < UF; ++Part) {
// For each scalar that we create:
+ // Start an "if (pred) a[i] = ..." block.
+ Value *Cmp = 0;
+ if (IfPredicateStore) {
+ if (Cond[Part]->getType()->isVectorTy())
+ Cond[Part] =
+ Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
+ Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
+ ConstantInt::get(Cond[Part]->getType(), 1));
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ VectorLp->addBasicBlockToLoop(CondBlock, LI->getBase());
+ // Update Builder with newly created basic block.
+ Builder.SetInsertPoint(InsertPt);
+ }
+
Instruction *Cloned = Instr->clone();
if (!IsVoidRetTy)
Cloned->setName(Instr->getName() + ".cloned");
@@ -5580,11 +5731,25 @@ void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr) {
// so that future users will be able to use it.
if (!IsVoidRetTy)
VecResults[Part] = Cloned;
+
+ // End if-block.
+ if (IfPredicateStore) {
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ VectorLp->addBasicBlockToLoop(NewIfBlock, LI->getBase());
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
}
}
void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
- return scalarizeInstruction(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+ bool IfPredicateStore = (SI && Legal->blockNeedsPredication(SI->getParent()));
+
+ return scalarizeInstruction(Instr, IfPredicateStore);
}
Value *InnerLoopUnroller::reverseVector(Value *Vec) {