//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// #define DEBUG_TYPE "SLP" #include "VecUtils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include #include using namespace llvm; static const unsigned MinVecRegSize = 128; static const unsigned RecursionMaxDepth = 6; namespace llvm { BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : Builder(S->getContext()), BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { numberInstructions(); } void BoUpSLP::numberInstructions() { int Loc = 0; InstrIdx.clear(); InstrVec.clear(); // Number the instructions in the block. for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { InstrIdx[it] = Loc++; InstrVec.push_back(it); assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); } } Value *BoUpSLP::getPointerOperand(Value *I) { if (LoadInst *LI = dyn_cast(I)) return LI->getPointerOperand(); if (StoreInst *SI = dyn_cast(I)) return SI->getPointerOperand(); return 0; } unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { if (LoadInst *L=dyn_cast(I)) return L->getPointerAddressSpace(); if (StoreInst *S=dyn_cast(I)) return S->getPointerAddressSpace(); return -1; } bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { Value *PtrA = getPointerOperand(A); Value *PtrB = getPointerOperand(B); unsigned ASA = getAddressSpaceOperand(A); unsigned ASB = getAddressSpaceOperand(B); // Check that the address spaces match and that the pointers are valid. if (!PtrA || !PtrB || (ASA != ASB)) return false; // Check that A and B are of the same type. if (PtrA->getType() != PtrB->getType()) return false; // Calculate the distance. const SCEV *PtrSCEVA = SE->getSCEV(PtrA); const SCEV *PtrSCEVB = SE->getSCEV(PtrB); const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB); const SCEVConstant *ConstOffSCEV = dyn_cast(OffsetSCEV); // Non constant distance. if (!ConstOffSCEV) return false; int64_t Offset = ConstOffSCEV->getValue()->getSExtValue(); Type *Ty = cast(PtrA->getType())->getElementType(); // The Instructions are connsecutive if the size of the first load/store is // the same as the offset. int64_t Sz = DL->getTypeStoreSize(Ty); return ((-Offset) == Sz); } bool BoUpSLP::vectorizeStoreChain(ArrayRef Chain, int CostThreshold) { Type *StoreTy = cast(Chain[0])->getValueOperand()->getType(); unsigned Sz = DL->getTypeSizeInBits(StoreTy); unsigned VF = MinVecRegSize / Sz; if (!isPowerOf2_32(Sz) || VF < 2) return false; bool Changed = false; // Look for profitable vectorizable trees at all offsets, starting at zero. for (unsigned i = 0, e = Chain.size(); i < e; ++i) { if (i + VF > e) return Changed; DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n"); ArrayRef Operands = Chain.slice(i, VF); int Cost = getTreeCost(Operands); DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < CostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); Builder.SetInsertPoint(getInsertionPoint(getLastIndex(Operands,VF))); vectorizeTree(Operands, VF); i += VF - 1; Changed = true; } } return Changed; } bool BoUpSLP::vectorizeStores(ArrayRef Stores, int costThreshold) { SetVector Heads, Tails; SmallDenseMap ConsecutiveChain; // We may run into multiple chains that merge into a single chain. We mark the // stores that we vectorized so that we don't visit the same store twice. ValueSet VectorizedStores; bool Changed = false; // Do a quadratic search on all of the given stores and find // all of the pairs of loads that follow each other. for (unsigned i = 0, e = Stores.size(); i < e; ++i) for (unsigned j = 0; j < e; ++j) { if (i == j) continue; if (isConsecutiveAccess(Stores[i], Stores[j])) { Tails.insert(Stores[j]); Heads.insert(Stores[i]); ConsecutiveChain[Stores[i]] = Stores[j]; } } // For stores that start but don't end a link in the chain: for (SetVector::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) { if (Tails.count(*it)) continue; // We found a store instr that starts a chain. Now follow the chain and try // to vectorize it. ValueList Operands; Value *I = *it; // Collect the chain into a list. while (Tails.count(I) || Heads.count(I)) { if (VectorizedStores.count(I)) break; Operands.push_back(I); // Move to the next value in the chain. I = ConsecutiveChain[I]; } bool Vectorized = vectorizeStoreChain(Operands, costThreshold); // Mark the vectorized stores so that we don't vectorize them again. if (Vectorized) VectorizedStores.insert(Operands.begin(), Operands.end()); Changed |= Vectorized; } return Changed; } int BoUpSLP::getScalarizationCost(ArrayRef VL) { // Find the type of the operands in VL. Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. return getScalarizationCost(VecTy); } int BoUpSLP::getScalarizationCost(Type *Ty) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); return Cost; } AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { if (StoreInst *SI = dyn_cast(I)) return AA->getLocation(SI); if (LoadInst *LI = dyn_cast(I)) return AA->getLocation(LI); return AliasAnalysis::Location(); } Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { assert(Src->getParent() == Dst->getParent() && "Not the same BB"); BasicBlock::iterator I = Src, E = Dst; /// Scan all of the instruction from SRC to DST and check if /// the source may alias. for (++I; I != E; ++I) { // Ignore store instructions that are marked as 'ignore'. if (MemBarrierIgnoreList.count(I)) continue; if (Src->mayWriteToMemory()) /* Write */ { if (!I->mayReadOrWriteMemory()) continue; } else /* Read */ { if (!I->mayWriteToMemory()) continue; } AliasAnalysis::Location A = getLocation(&*I); AliasAnalysis::Location B = getLocation(Src); if (!A.Ptr || !B.Ptr || AA->alias(A, B)) return I; } return 0; } void BoUpSLP::vectorizeArith(ArrayRef Operands) { int LastIdx = getLastIndex(Operands, Operands.size()); Instruction *Loc = getInsertionPoint(LastIdx); Builder.SetInsertPoint(Loc); assert(getFirstUserIndex(Operands, Operands.size()) > LastIdx && "Vectorizing with in-tree users"); Value *Vec = vectorizeTree(Operands, Operands.size()); // After vectorizing the operands we need to generate extractelement // instructions and replace all of the uses of the scalar values with // the values that we extracted from the vectorized tree. for (unsigned i = 0, e = Operands.size(); i != e; ++i) { Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); Operands[i]->replaceAllUsesWith(S); } } int BoUpSLP::getTreeCost(ArrayRef VL) { // Get rid of the list of stores that were removed, and from the // lists of instructions with multiple users. MemBarrierIgnoreList.clear(); LaneMap.clear(); MultiUserVals.clear(); MustScalarize.clear(); MustExtract.clear(); // Find the location of the last root. int LastRootIndex = getLastIndex(VL, VL.size()); int FirstUserIndex = getFirstUserIndex(VL, VL.size()); // Don't vectorize if there are users of the tree roots inside the tree // itself. if (LastRootIndex > FirstUserIndex) return max_cost; // Scan the tree and find which value is used by which lane, and which values // must be scalarized. getTreeUses_rec(VL, 0); // Check that instructions with multiple users can be vectorized. Mark unsafe // instructions. for (SetVector::iterator it = MultiUserVals.begin(), e = MultiUserVals.end(); it != e; ++it) { // Check that all of the users of this instr are within the tree // and that they are all from the same lane. int Lane = -1; for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); I != E; ++I) { if (LaneMap.find(*I) == LaneMap.end()) { DEBUG(dbgs()<<"SLP: Instr " << **it << " has multiple users.\n"); // We don't have an ordering problem if the user is not in this basic // block. Instruction *Inst = cast(*I); if (Inst->getParent() != BB) { MustExtract.insert(*it); continue; } // We don't have an ordering problem if the user is after the last root. int Idx = InstrIdx[Inst]; if (Idx < LastRootIndex) { MustScalarize.insert(*it); DEBUG(dbgs()<<"SLP: Adding to MustScalarize " "because of an unsafe out of tree usage.\n"); break; } DEBUG(dbgs()<<"SLP: Adding to MustExtract " "because of a safe out of tree usage.\n"); MustExtract.insert(*it); continue; } if (Lane == -1) Lane = LaneMap[*I]; if (Lane != LaneMap[*I]) { MustScalarize.insert(*it); DEBUG(dbgs()<<"SLP: Adding " << **it << " to MustScalarize because multiple lane use it: " << Lane << " and " << LaneMap[*I] << ".\n"); break; } } } // Now calculate the cost of vectorizing the tree. return getTreeCost_rec(VL, 0); } void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { if (Depth == RecursionMaxDepth) return; // Don't handle vectors. if (VL[0]->getType()->isVectorTy()) return; if (StoreInst *SI = dyn_cast(VL[0])) if (SI->getValueOperand()->getType()->isVectorTy()) return; // Check if all of the operands are constants. bool AllConst = true; bool AllSameScalar = true; for (unsigned i = 0, e = VL.size(); i < e; ++i) { AllConst &= isa(VL[i]); AllSameScalar &= (VL[0] == VL[i]); Instruction *I = dyn_cast(VL[i]); // If one of the instructions is out of this BB, we need to scalarize all. if (I && I->getParent() != BB) return; } // If all of the operands are identical or constant we have a simple solution. if (AllConst || AllSameScalar) return; // Scalarize unknown structures. Instruction *VL0 = dyn_cast(VL[0]); if (!VL0) return; unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *I = dyn_cast(VL[i]); // If not all of the instructions are identical then we have to scalarize. if (!I || Opcode != I->getOpcode()) return; } for (int i = 0, e = VL.size(); i < e; ++i) { // Check that the instruction is only used within // one lane. if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; // Make this instruction as 'seen' and remember the lane. LaneMap[VL[i]] = i; } // Mark instructions with multiple users. for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *I = dyn_cast(VL[i]); // Remember to check if all of the users of this instr are vectorized // within our tree. At depth zero we have no local users, only external // users that we don't care about. if (Depth && I && I->getNumUses() > 1) { DEBUG(dbgs()<<"SLP: Adding to MultiUserVals " "because it has multiple users:" << *I << " \n"); MultiUserVals.insert(I); } } switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) Operands.push_back(cast(VL[j])->getOperand(i)); getTreeUses_rec(Operands, Depth+1); } return; } case Instruction::Store: { ValueList Operands; for (unsigned j = 0; j < VL.size(); ++j) Operands.push_back(cast(VL[j])->getOperand(0)); getTreeUses_rec(Operands, Depth+1); return; } default: return; } } int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); /// Don't mess with vectors. if (ScalarTy->isVectorTy()) return max_cost; VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy); // Check if all of the operands are constants. bool AllConst = true; bool AllSameScalar = true; bool MustScalarizeFlag = false; for (unsigned i = 0, e = VL.size(); i < e; ++i) { AllConst &= isa(VL[i]); AllSameScalar &= (VL[0] == VL[i]); // Must have a single use. Instruction *I = dyn_cast(VL[i]); MustScalarizeFlag |= MustScalarize.count(VL[i]); // This instruction is outside the basic block. if (I && I->getParent() != BB) return getScalarizationCost(VecTy); } // Is this a simple vector constant. if (AllConst) return 0; // If all of the operands are identical we can broadcast them. Instruction *VL0 = dyn_cast(VL[0]); if (AllSameScalar) { // If we are in a loop, and this is not an instruction (e.g. constant or // argument) or the instruction is defined outside the loop then assume // that the cost is zero. if (L && (!VL0 || !L->contains(VL0))) return 0; // We need to broadcast the scalar. return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } // If this is not a constant, or a scalar from outside the loop then we // need to scalarize it. if (MustScalarizeFlag) return getScalarizationCost(VecTy); if (!VL0) return getScalarizationCost(VecTy); assert(VL0->getParent() == BB && "Wrong BB"); unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *I = dyn_cast(VL[i]); // If not all of the instructions are identical then we have to scalarize. if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy); } // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { int MaxIdx = getLastIndex(VL, VL.size()); Instruction *Last = InstrVec[MaxIdx]; for (unsigned i = 0, e = VL.size(); i < e; ++i ) { if (VL[i] == Last) continue; Value *Barrier = isUnsafeToSink(cast(VL[i]), Last); if (Barrier) { DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << *Last << "\n because of " << *Barrier << "\n"); return max_cost; } } } // Calculate the extract cost. unsigned ExternalUserExtractCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) if (MustExtract.count(VL[i])) ExternalUserExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { int Cost = ExternalUserExtractCost; ValueList Operands; Type *SrcTy = VL0->getOperand(0)->getType(); // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) { Operands.push_back(cast(VL[j])->getOperand(0)); // Check that the casted type is the same for all users. if (cast(VL[j])->getOperand(0)->getType() != SrcTy) return getScalarizationCost(VecTy); } Cost += getTreeCost_rec(Operands, Depth+1); if (Cost >= max_cost) return max_cost; // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), VL0->getType(), SrcTy); VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); Cost += (VecCost - ScalarCost); return Cost; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { int Cost = ExternalUserExtractCost; // Calculate the cost of all of the operands. for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (unsigned j = 0; j < VL.size(); ++j) Operands.push_back(cast(VL[j])->getOperand(i)); Cost += getTreeCost_rec(Operands, Depth+1); if (Cost >= max_cost) return max_cost; } // Calculate the cost of this instruction. int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Opcode, ScalarTy); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); Cost += (VecCost - ScalarCost); return Cost; } case Instruction::Load: { // If we are scalarize the loads, add the cost of forming the vector. for (unsigned i = 0, e = VL.size()-1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i+1])) return getScalarizationCost(VecTy); // Cost of wide load - cost of scalar loads. int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); return VecLdCost - ScalarLdCost + ExternalUserExtractCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. int ScalarStCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0); int StoreCost = VecStCost - ScalarStCost; ValueList Operands; for (unsigned j = 0; j < VL.size(); ++j) { Operands.push_back(cast(VL[j])->getOperand(0)); MemBarrierIgnoreList.insert(VL[j]); } int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); return TotalCost + ExternalUserExtractCost; } default: // Unable to vectorize unknown instructions. return getScalarizationCost(VecTy); } } int BoUpSLP::getLastIndex(ArrayRef VL, unsigned VF) { int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; for (unsigned i = 0; i < VF; ++i ) MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); return MaxIdx; } int BoUpSLP::getFirstUserIndex(ArrayRef VL, unsigned VF) { // Find the first user of the values. int FirstUser = InstrVec.size(); for (unsigned i = 0; i < VF; ++i) { for (Value::use_iterator U = VL[i]->use_begin(), UE = VL[i]->use_end(); U != UE; ++U) { Instruction *Instr = dyn_cast(*U); if (!Instr || Instr->getParent() != BB) continue; FirstUser = std::min(FirstUser, InstrIdx[Instr]); } } return FirstUser; } int BoUpSLP::getLastIndex(Instruction *I, Instruction *J) { assert(I->getParent() == BB && "Invalid parent for instruction I"); assert(J->getParent() == BB && "Invalid parent for instruction J"); return std::max(InstrIdx[I],InstrIdx[J]); } Instruction *BoUpSLP::getInsertionPoint(unsigned Index) { return InstrVec[Index + 1]; } Value *BoUpSLP::Scalarize(ArrayRef VL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); for (unsigned i=0; i < Ty->getNumElements(); ++i) { // Generate the 'InsertElement' instruction. Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); // Remember that this instruction is used as part of a 'gather' sequence. // The caller of the bottom-up slp vectorizer can try to hoist the sequence // if the users are outside of the basic block. GatherInstructions.push_back(Vec); } for (unsigned i = 0; i < Ty->getNumElements(); ++i) VectorizedValues[VL[i]] = Vec; return Vec; } Value *BoUpSLP::vectorizeTree(ArrayRef VL, int VF) { Value *V = vectorizeTree_rec(VL, VF); int LastInstrIdx = getLastIndex(VL, VL.size()); for (SetVector::iterator it = MustExtract.begin(), e = MustExtract.end(); it != e; ++it) { Instruction *I = cast(*it); // This is a scalarized value, so we can use the original value. // No need to extract from the vector. if (!LaneMap.count(I)) continue; Value *Vec = VectorizedValues[I]; // We decided not to vectorize I because one of its users was not // vectorizerd. This is okay. if (!Vec) continue; Value *Idx = Builder.getInt32(LaneMap[I]); Value *Extract = Builder.CreateExtractElement(Vec, Idx); bool Replaced = false; for (Value::use_iterator U = I->use_begin(), UE = I->use_end(); U != UE; ++U) { Instruction *UI = cast(*U); if (UI->getParent() != I->getParent() || InstrIdx[UI] > LastInstrIdx) UI->replaceUsesOfWith(I ,Extract); Replaced = true; } assert(Replaced && "Must replace at least one outside user"); (void)Replaced; } // We moved some instructions around. We have to number them again // before we can do any analysis. numberInstructions(); MustScalarize.clear(); MustExtract.clear(); VectorizedValues.clear(); return V; } Value *BoUpSLP::vectorizeTree_rec(ArrayRef VL, int VF) { Type *ScalarTy = VL[0]->getType(); if (StoreInst *SI = dyn_cast(VL[0])) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VF); // Check if all of the operands are constants or identical. bool AllConst = true; bool AllSameScalar = true; for (unsigned i = 0, e = VF; i < e; ++i) { AllConst &= isa(VL[i]); AllSameScalar &= (VL[0] == VL[i]); // The instruction must be in the same BB, and it must be vectorizable. Instruction *I = dyn_cast(VL[i]); if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) return Scalarize(VL, VecTy); } // Check that this is a simple vector constant. if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); // Scalarize unknown structures. Instruction *VL0 = dyn_cast(VL[0]); if (!VL0) return Scalarize(VL, VecTy); if (VectorizedValues.count(VL0)) { Value * Vec = VectorizedValues[VL0]; for (int i = 0; i < VF; ++i) VectorizedValues[VL[i]] = Vec; return Vec; } unsigned Opcode = VL0->getOpcode(); for (unsigned i = 0, e = VF; i < e; ++i) { Instruction *I = dyn_cast(VL[i]); // If not all of the instructions are identical then we have to scalarize. if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); } switch (Opcode) { case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: case Instruction::FPToSI: case Instruction::FPExt: case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::SIToFP: case Instruction::UIToFP: case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; for (int i = 0; i < VF; ++i) INVL.push_back(cast(VL[i])->getOperand(0)); Value *InVec = vectorizeTree_rec(INVL, VF); CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); for (int i = 0; i < VF; ++i) VectorizedValues[VL[i]] = V; return V; } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: case Instruction::FSub: case Instruction::Mul: case Instruction::FMul: case Instruction::UDiv: case Instruction::SDiv: case Instruction::FDiv: case Instruction::URem: case Instruction::SRem: case Instruction::FRem: case Instruction::Shl: case Instruction::LShr: case Instruction::AShr: case Instruction::And: case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; for (int i = 0; i < VF; ++i) { LHSVL.push_back(cast(VL[i])->getOperand(0)); RHSVL.push_back(cast(VL[i])->getOperand(1)); } Value *LHS = vectorizeTree_rec(LHSVL, VF); Value *RHS = vectorizeTree_rec(RHSVL, VF); BinaryOperator *BinOp = cast(VL0); Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS,RHS); for (int i = 0; i < VF; ++i) VectorizedValues[VL[i]] = V; return V; } case Instruction::Load: { LoadInst *LI = cast(VL0); unsigned Alignment = LI->getAlignment(); // Check if all of the loads are consecutive. for (unsigned i = 1, e = VF; i < e; ++i) if (!isConsecutiveAccess(VL[i-1], VL[i])) return Scalarize(VL, VecTy); // Loads are inserted at the head of the tree because we don't want to sink // them all the way down past store instructions. Instruction *Loc = getInsertionPoint(getLastIndex(VL, VL.size())); IRBuilder<> LoadBuilder(Loc); Value *VecPtr = LoadBuilder.CreateBitCast(LI->getPointerOperand(), VecTy->getPointerTo()); LI = LoadBuilder.CreateLoad(VecPtr); LI->setAlignment(Alignment); for (int i = 0; i < VF; ++i) VectorizedValues[VL[i]] = LI; return LI; } case Instruction::Store: { StoreInst *SI = cast(VL0); unsigned Alignment = SI->getAlignment(); ValueList ValueOp; for (int i = 0; i < VF; ++i) ValueOp.push_back(cast(VL[i])->getValueOperand()); Value *VecValue = vectorizeTree_rec(ValueOp, VF); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), VecTy->getPointerTo()); Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); for (int i = 0; i < VF; ++i) cast(VL[i])->eraseFromParent(); return 0; } default: return Scalarize(VL, VecTy); } } } // end of namespace