From de5e5ec3045a73a06b1054417f9ac6c02929e9ce Mon Sep 17 00:00:00 2001 From: Hal Finkel Date: Wed, 1 Feb 2012 03:51:43 +0000 Subject: Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 --- docs/Passes.html | 21 + include/llvm-c/Initialization.h | 1 + include/llvm-c/Transforms/Vectorize.h | 37 + include/llvm/InitializePasses.h | 6 +- include/llvm/LinkAllPasses.h | 2 + include/llvm/Transforms/IPO/PassManagerBuilder.h | 1 + include/llvm/Transforms/Vectorize.h | 30 + lib/Transforms/CMakeLists.txt | 1 + lib/Transforms/IPO/LLVMBuild.txt | 2 +- lib/Transforms/IPO/PassManagerBuilder.cpp | 14 + lib/Transforms/LLVMBuild.txt | 2 +- lib/Transforms/Makefile | 2 +- lib/Transforms/Vectorize/BBVectorize.cpp | 1796 ++++++++++++++++++++++ lib/Transforms/Vectorize/CMakeLists.txt | 4 + lib/Transforms/Vectorize/LLVMBuild.txt | 24 + lib/Transforms/Vectorize/Makefile | 15 + lib/Transforms/Vectorize/Vectorize.cpp | 39 + test/Transforms/BBVectorize/cycle.ll | 112 ++ test/Transforms/BBVectorize/dg.exp | 3 + test/Transforms/BBVectorize/ld1.ll | 41 + test/Transforms/BBVectorize/loop1.ll | 93 ++ test/Transforms/BBVectorize/req-depth.ll | 17 + test/Transforms/BBVectorize/search-limit.ll | 46 + test/Transforms/BBVectorize/simple-int.ll | 59 + test/Transforms/BBVectorize/simple-ldstr.ll | 110 ++ test/Transforms/BBVectorize/simple.ll | 152 ++ tools/bugpoint/CMakeLists.txt | 2 +- tools/bugpoint/Makefile | 2 +- tools/llvm-ld/CMakeLists.txt | 2 +- tools/llvm-ld/Makefile | 2 +- tools/lto/CMakeLists.txt | 2 +- tools/lto/Makefile | 2 +- tools/opt/CMakeLists.txt | 2 +- tools/opt/Makefile | 2 +- tools/opt/opt.cpp | 1 + 35 files changed, 2635 insertions(+), 12 deletions(-) create mode 100644 include/llvm-c/Transforms/Vectorize.h create mode 100644 include/llvm/Transforms/Vectorize.h create mode 100644 lib/Transforms/Vectorize/BBVectorize.cpp create mode 100644 lib/Transforms/Vectorize/CMakeLists.txt create mode 100644 lib/Transforms/Vectorize/LLVMBuild.txt create mode 100644 lib/Transforms/Vectorize/Makefile create mode 100644 lib/Transforms/Vectorize/Vectorize.cpp create mode 100644 test/Transforms/BBVectorize/cycle.ll create mode 100644 test/Transforms/BBVectorize/dg.exp create mode 100644 test/Transforms/BBVectorize/ld1.ll create mode 100644 test/Transforms/BBVectorize/loop1.ll create mode 100644 test/Transforms/BBVectorize/req-depth.ll create mode 100644 test/Transforms/BBVectorize/search-limit.ll create mode 100644 test/Transforms/BBVectorize/simple-int.ll create mode 100644 test/Transforms/BBVectorize/simple-ldstr.ll create mode 100644 test/Transforms/BBVectorize/simple.ll diff --git a/docs/Passes.html b/docs/Passes.html index 5c42f3fdd5..840b2eff06 100644 --- a/docs/Passes.html +++ b/docs/Passes.html @@ -126,6 +126,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "

\n" if ! -adceAggressive Dead Code Elimination -always-inlineInliner for always_inline functions -argpromotionPromote 'by reference' arguments to scalars +-bb-vectorizeCombine instructions to form vector instructions within basic blocks -block-placementProfile Guided Basic Block Placement -break-crit-edgesBreak critical edges in CFG -codegenprepareOptimize for code generation @@ -815,6 +816,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "

\n" if !

+ +

+ -bb-vectorize: Basic-Block Vectorization +

+
+

This pass combines instructions inside basic blocks to form vector + instructions. It iterates over each basic block, attempting to pair + compatible instructions, repeating this process until no additional + pairs are selected for vectorization. When the outputs of some pair + of compatible instructions are used as inputs by some other pair of + compatible instructions, those pairs are part of a potential + vectorization chain. Instruction pairs are only fused into vector + instructions when they are part of a chain longer than some + threshold length. Moreover, the pass attempts to find the best + possible chain for each pair of compatible instructions. These + heuristics are intended to prevent vectorization in cases where + it would not yield a performance increase of the resulting code. +

+
+

-block-placement: Profile Guided Basic Block Placement diff --git a/include/llvm-c/Initialization.h b/include/llvm-c/Initialization.h index 3b59abbec0..cbe60df908 100644 --- a/include/llvm-c/Initialization.h +++ b/include/llvm-c/Initialization.h @@ -25,6 +25,7 @@ extern "C" { void LLVMInitializeCore(LLVMPassRegistryRef R); void LLVMInitializeTransformUtils(LLVMPassRegistryRef R); void LLVMInitializeScalarOpts(LLVMPassRegistryRef R); +void LLVMInitializeVectorization(LLVMPassRegistryRef R); void LLVMInitializeInstCombine(LLVMPassRegistryRef R); void LLVMInitializeIPO(LLVMPassRegistryRef R); void LLVMInitializeInstrumentation(LLVMPassRegistryRef R); diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h new file mode 100644 index 0000000000..178465ac82 --- /dev/null +++ b/include/llvm-c/Transforms/Vectorize.h @@ -0,0 +1,37 @@ +/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\ +|*===----------- Vectorization Transformation Library C Interface ---------===*| +|* *| +|* The LLVM Compiler Infrastructure *| +|* *| +|* This file is distributed under the University of Illinois Open Source *| +|* License. See LICENSE.TXT for details. *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to libLLVMVectorize.a, which *| +|* implements various vectorization transformations of the LLVM IR. *| +|* *| +|* Many exotic languages can interoperate with C code but have a harder time *| +|* with C++ due to name mangling. So in addition to C, this interface enables *| +|* tools written in such languages. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_TRANSFORMS_VECTORIZE_H +#define LLVM_C_TRANSFORMS_VECTORIZE_H + +#include "llvm-c/Core.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** See llvm::createBBVectorizePass function. */ +void LLVMAddBBVectorizePass(LLVMPassManagerRef PM); + +#ifdef __cplusplus +} +#endif /* defined(__cplusplus) */ + +#endif + diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 6cc3f195c7..ac129c77d5 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&); /// ScalarOpts library. void initializeScalarOpts(PassRegistry&); +/// initializeVectorization - Initialize all passes linked into the +/// Vectorize library. +void initializeVectorization(PassRegistry&); + /// initializeInstCombine - Initialize all passes linked into the /// ScalarOpts library. void initializeInstCombine(PassRegistry&); @@ -236,7 +240,7 @@ void initializeVirtRegMapPass(PassRegistry&); void initializeInstSimplifierPass(PassRegistry&); void initializeUnpackMachineBundlesPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); - +void initializeBBVectorizePass(PassRegistry&); } #endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 537fab757b..2258d45ce9 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -31,6 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include @@ -151,6 +152,7 @@ namespace { (void) llvm::createCorrelatedValuePropagationPass(); (void) llvm::createMemDepPrinter(); (void) llvm::createInstructionSimplifierPass(); + (void) llvm::createBBVectorizePass(); (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index b4ba184781..a1b4f5cd90 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -99,6 +99,7 @@ public: bool DisableSimplifyLibCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; + bool Vectorize; private: /// ExtensionList - This is list of all of the extensions that are registered. diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h new file mode 100644 index 0000000000..dfc099ddb9 --- /dev/null +++ b/include/llvm/Transforms/Vectorize.h @@ -0,0 +1,30 @@ +//===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the Vectorize transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_H +#define LLVM_TRANSFORMS_VECTORIZE_H + +namespace llvm { + +class BasicBlockPass; + +//===----------------------------------------------------------------------===// +// +// BBVectorize - A basic-block vectorization pass. +// +BasicBlockPass *createBBVectorizePass(); + +} // End llvm namespace + +#endif diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 10e0cc6b56..de1353e6c1 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -3,4 +3,5 @@ add_subdirectory(Instrumentation) add_subdirectory(InstCombine) add_subdirectory(Scalar) add_subdirectory(IPO) +add_subdirectory(Vectorize) add_subdirectory(Hello) diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt index b358fabdfa..b18c9150f4 100644 --- a/lib/Transforms/IPO/LLVMBuild.txt +++ b/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils +required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index afd25dc317..84084374b3 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -21,14 +21,20 @@ #include "llvm/DefaultPasses.h" #include "llvm/PassManager.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; +static cl::opt +RunVectorization("vectorize", cl::desc("Run vectorization passes")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -37,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() { DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; + Vectorize = RunVectorization; } PassManagerBuilder::~PassManagerBuilder() { @@ -172,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); + if (Vectorize) { + MPM.add(createBBVectorizePass()); + MPM.add(createInstructionCombiningPass()); + if (OptLevel > 1) + MPM.add(createGVNPass()); // Remove redundancies + } + MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt index b2ef49a4c6..f7bca064c7 100644 --- a/lib/Transforms/LLVMBuild.txt +++ b/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = IPO InstCombine Instrumentation Scalar Utils +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize [component_0] type = Group diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index e527be25de..8b1df92fa2 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,7 @@ ##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp new file mode 100644 index 0000000000..9c2c8dd51b --- /dev/null +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -0,0 +1,1796 @@ +//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a basic-block vectorization pass. The algorithm was +// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, +// et al. It works by looking for chains of pairable operations and then +// pairing them. +// +//===----------------------------------------------------------------------===// + +#define BBV_NAME "bb-vectorize" +#define DEBUG_TYPE BBV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Vectorize.h" +#include +#include +using namespace llvm; + +static cl::opt +ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, + cl::desc("The required chain depth for vectorization")); + +static cl::opt +SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, + cl::desc("The maximum search distance for instruction pairs")); + +static cl::opt +SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, + cl::desc("Replicating one element to a pair breaks the chain")); + +static cl::opt +VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, + cl::desc("The size of the native vector registers")); + +static cl::opt +MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, + cl::desc("The maximum number of pairing iterations")); + +static cl::opt +MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), + cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" + " a full cycle check")); + +static cl::opt +NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize integer values")); + +static cl::opt +NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point values")); + +static cl::opt +NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize casting (conversion) operations")); + +static cl::opt +NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point math intrinsics")); + +static cl::opt +NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); + +static cl::opt +NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize loads and stores")); + +static cl::opt +AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, + cl::desc("Only generate aligned loads and stores")); + +static cl::opt +FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, + cl::desc("Use a fast instruction dependency analysis")); + +#ifndef NDEBUG +static cl::opt +DebugInstructionExamination("bb-vectorize-debug-instruction-examination", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " instruction-examination process")); +static cl::opt +DebugCandidateSelection("bb-vectorize-debug-candidate-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " candidate-selection process")); +static cl::opt +DebugPairSelection("bb-vectorize-debug-pair-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " pair-selection process")); +static cl::opt +DebugCycleCheck("bb-vectorize-debug-cycle-check", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " cycle-checking process")); +#endif + +STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); + +namespace { + struct BBVectorize : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + BBVectorize() : BasicBlockPass(ID) { + initializeBBVectorizePass(*PassRegistry::getPassRegistry()); + } + + typedef std::pair ValuePair; + typedef std::pair ValuePairWithDepth; + typedef std::pair VPPair; // A ValuePair pair + typedef std::pair::iterator, + std::multimap::iterator> VPIteratorPair; + typedef std::pair::iterator, + std::multimap::iterator> + VPPIteratorPair; + + AliasAnalysis *AA; + ScalarEvolution *SE; + TargetData *TD; + + // FIXME: const correct? + + bool vectorizePairs(BasicBlock &BB); + + void getCandidatePairs(BasicBlock &BB, + std::multimap &CandidatePairs, + std::vector &PairableInsts); + + void computeConnectedPairs(std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs); + + void buildDepMap(BasicBlock &BB, + std::multimap &CandidatePairs, + std::vector &PairableInsts, + DenseSet &PairableInstUsers); + + void choosePairs(std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + DenseMap& ChosenPairs); + + void fuseChosenPairs(BasicBlock &BB, + std::vector &PairableInsts, + DenseMap& ChosenPairs); + + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); + + bool areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore); + + bool trackUsesOfI(DenseSet &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers = true, + std::multimap *LoadMoveSet = 0); + + void computePairsConnectedTo( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + ValuePair P); + + bool pairsConflict(ValuePair P, ValuePair Q, + DenseSet &PairableInstUsers, + std::multimap *PairableInstUserMap = 0); + + bool pairWillFormCycle(ValuePair P, + std::multimap &PairableInstUsers, + DenseSet &CurrentPairs); + + void pruneTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + std::multimap &PairableInstUserMap, + DenseMap &ChosenPairs, + DenseMap &Tree, + DenseSet &PrunedTree, ValuePair J, + bool UseCycleCheck); + + void buildInitialTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + DenseMap &ChosenPairs, + DenseMap &Tree, ValuePair J); + + void findBestTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + std::multimap &PairableInstUserMap, + DenseMap &ChosenPairs, + DenseSet &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck); + + Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool &FlipMemInputs); + + void fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector &Mask); + + Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, + Instruction *J); + + Value *getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, + Instruction *J, SmallVector &ReplacedOperands, + bool &FlipMemInputs); + + void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, Instruction *&K1, + Instruction *&K2, bool &FlipMemInputs); + + void collectPairLoadMoveSet(BasicBlock &BB, + DenseMap &ChosenPairs, + std::multimap &LoadMoveSet, + Instruction *I); + + void collectLoadMoveSet(BasicBlock &BB, + std::vector &PairableInsts, + DenseMap &ChosenPairs, + std::multimap &LoadMoveSet); + + bool canMoveUsesOfIAfterJ(BasicBlock &BB, + std::multimap &LoadMoveSet, + Instruction *I, Instruction *J); + + void moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J); + + virtual bool runOnBasicBlock(BasicBlock &BB) { + AA = &getAnalysis(); + SE = &getAnalysis(); + TD = getAnalysisIfAvailable(); + + bool changed = false; + // Iterate a sufficient number of times to merge types of size 1 bit, + // then 2 bits, then 4, etc. up to half of the target vector width of the + // target vector register. + for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter); + v *= 2, ++n) { + DEBUG(dbgs() << "BBV: fusing loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) + changed = true; + else + break; + } + + DEBUG(dbgs() << "BBV: done!\n"); + return changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + BasicBlockPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + } + + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. + static inline VectorType *getVecTypeForPair(Type *ElemTy) { + if (VectorType *VTy = dyn_cast(ElemTy)) { + unsigned numElem = VTy->getNumElements(); + return VectorType::get(ElemTy->getScalarType(), numElem*2); + } else { + return VectorType::get(ElemTy, 2); + } + } + + // Returns the weight associated with the provided value. A chain of + // candidate pairs has a length given by the sum of the weights of its + // members (one weight per pair; the weight of each member of the pair + // is assumed to be the same). This length is then compared to the + // chain-length threshold to determine if a given chain is significant + // enough to be vectorized. The length is also used in comparing + // candidate chains where longer chains are considered to be better. + // Note: when this function returns 0, the resulting instructions are + // not actually fused. + static inline size_t getDepthFactor(Value *V) { + // InsertElement and ExtractElement have a depth factor of zero. This is + // for two reasons: First, they cannot be usefully fused. Second, because + // the pass generates a lot of these, they can confuse the simple metric + // used to compare the trees in the next iteration. Thus, giving them a + // weight of zero allows the pass to essentially ignore them in + // subsequent iterations when looking for vectorization opportunities + // while still tracking dependency chains that flow through those + // instructions. + if (isa(V) || isa(V)) + return 0; + + return 1; + } + + // This determines the relative offset of two loads or stores, returning + // true if the offset could be determined to be some constant value. + // For example, if OffsetInElmts == 1, then J accesses the memory directly + // after I; if OffsetInElmts == -1 then I accesses the memory + // directly after J. This function assumes that both instructions + // have the same type. + bool getPairPtrInfo(Instruction *I, Instruction *J, + Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + int64_t &OffsetInElmts) { + OffsetInElmts = 0; + if (isa(I)) { + IPtr = cast(I)->getPointerOperand(); + JPtr = cast(J)->getPointerOperand(); + IAlignment = cast(I)->getAlignment(); + JAlignment = cast(J)->getAlignment(); + } else { + IPtr = cast(I)->getPointerOperand(); + JPtr = cast(J)->getPointerOperand(); + IAlignment = cast(I)->getAlignment(); + JAlignment = cast(J)->getAlignment(); + } + + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); + const SCEV *JPtrSCEV = SE->getSCEV(JPtr); + + // If this is a trivial offset, then we'll get something like + // 1*sizeof(type). With target data, which we need anyway, this will get + // constant folded into a number. + const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); + if (const SCEVConstant *ConstOffSCEV = + dyn_cast(OffsetSCEV)) { + ConstantInt *IntOff = ConstOffSCEV->getValue(); + int64_t Offset = IntOff->getSExtValue(); + + Type *VTy = cast(IPtr->getType())->getElementType(); + int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); + + assert(VTy == cast(JPtr->getType())->getElementType()); + + OffsetInElmts = Offset/VTyTSS; + return (abs64(Offset) % VTyTSS) == 0; + } + + return false; + } + + // Returns true if the provided CallInst represents an intrinsic that can + // be vectorized. + bool isVectorizableIntrinsic(CallInst* I) { + Function *F = I->getCalledFunction(); + if (!F) return false; + + unsigned IID = F->getIntrinsicID(); + if (!IID) return false; + + switch(IID) { + default: + return false; + case Intrinsic::sqrt: + case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + return !NoMath; + case Intrinsic::fma: + return !NoFMA; + } + } + + // Returns true if J is the second element in some pair referenced by + // some multimap pair iterator pair. + template + bool isSecondInIteratorPair(V J, std::pair< + typename std::multimap::iterator, + typename std::multimap::iterator> PairRange) { + for (typename std::multimap::iterator K = PairRange.first; + K != PairRange.second; ++K) + if (K->second == J) return true; + + return false; + } + }; + + // This function implements one vectorization iteration on the provided + // basic block. It returns true if the block is changed. + bool BBVectorize::vectorizePairs(BasicBlock &BB) { + std::vector PairableInsts; + std::multimap CandidatePairs; + getCandidatePairs(BB, CandidatePairs, PairableInsts); + if (PairableInsts.size() == 0) return false; + + // Now we have a map of all of the pairable instructions and we need to + // select the best possible pairing. A good pairing is one such that the + // users of the pair are also paired. This defines a (directed) forest + // over the pairs such that two pairs are connected iff the second pair + // uses the first. + + // Note that it only matters that both members of the second pair use some + // element of the first pair (to allow for splatting). + + std::multimap ConnectedPairs; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + if (ConnectedPairs.size() == 0) return false; + + // Build the pairable-instruction dependency map + DenseSet PairableInstUsers; + buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); + + // There is now a graph of the connected pairs. For each variable, pick the + // pairing with the largest tree meeting the depth requirement on at least + // one branch. Then select all pairings that are part of that tree and + // remove them from the list of available pairings and pairable variables. + + DenseMap ChosenPairs; + choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs); + + if (ChosenPairs.size() == 0) return false; + NumFusedOps += ChosenPairs.size(); + + // A set of pairs has now been selected. It is now necessary to replace the + // paired instructions with vector instructions. For this procedure each + // operand much be replaced with a vector operand. This vector is formed + // by using build_vector on the old operands. The replaced values are then + // replaced with a vector_extract on the result. Subsequent optimization + // passes should coalesce the build/extract combinations. + + fuseChosenPairs(BB, PairableInsts, ChosenPairs); + + return true; + } + + // This function returns true if the provided instruction is capable of being + // fused into a vector instruction. This determination is based only on the + // type and other attributes of the instruction. + bool BBVectorize::isInstVectorizable(Instruction *I, + bool &IsSimpleLoadStore) { + IsSimpleLoadStore = false; + + if (CallInst *C = dyn_cast(I)) { + if (!isVectorizableIntrinsic(C)) + return false; + } else if (LoadInst *L = dyn_cast(I)) { + // Vectorize simple loads if possbile: + IsSimpleLoadStore = L->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) + return false; + } else if (StoreInst *S = dyn_cast(I)) { + // Vectorize simple stores if possbile: + IsSimpleLoadStore = S->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) + return false; + } else if (CastInst *C = dyn_cast(I)) { + // We can vectorize casts, but not casts of pointer types, etc. + if (NoCasts) + return false; + + Type *SrcTy = C->getSrcTy(); + if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) + return false; + + Type *DestTy = C->getDestTy(); + if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) + return false; + } else if (!(I->isBinaryOp() || isa(I) || + isa(I) || isa(I))) { + return false; + } + + // We can't vectorize memory operations without target data + if (TD == 0 && IsSimpleLoadStore) + return false; + + Type *T1, *T2; + if (isa(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast(I)->getSrcTy(); + else + T2 = T1; + + // Not every type can be vectorized... + if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || + !(VectorType::isValidElementType(T2) || T2->isVectorTy())) + return false; + + if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + + if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) + return false; + + if (T1->getPrimitiveSizeInBits() > VectorBits/2 || + T2->getPrimitiveSizeInBits() > VectorBits/2) + return false; + + return true; + } + + // This function returns true if the two provided instructions are compatible + // (meaning that they can be fused into a vector instruction). This assumes + // that I has already been determined to be vectorizable and that J is not + // in the use tree of I. + bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore) { + DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << + " <-> " << *J << "\n"); + + // Loads and stores can be merged if they have different alignments, + // but are otherwise the same. + LoadInst *LI, *LJ; + StoreInst *SI, *SJ; + if ((LI = dyn_cast(I)) && (LJ = dyn_cast(J))) { + if (I->getType() != J->getType()) + return false; + + if (LI->getPointerOperand()->getType() != + LJ->getPointerOperand()->getType() || + LI->isVolatile() != LJ->isVolatile() || + LI->getOrdering() != LJ->getOrdering() || + LI->getSynchScope() != LJ->getSynchScope()) + return false; + } else if ((SI = dyn_cast(I)) && (SJ = dyn_cast(J))) { + if (SI->getValueOperand()->getType() != + SJ->getValueOperand()->getType() || + SI->getPointerOperand()->getType() != + SJ->getPointerOperand()->getType() || + SI->isVolatile() != SJ->isVolatile() || + SI->getOrdering() != SJ->getOrdering() || + SI->getSynchScope() != SJ->getSynchScope()) + return false; + } else if (!J->isSameOperationAs(I)) { + return false; + } + // FIXME: handle addsub-type operations! + + if (IsSimpleLoadStore) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts = 0; + if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) && abs64(OffsetInElmts) == 1) { + if (AlignedOnly) { + Type *aType = isa(I) ? + cast(I)->getValueOperand()->getType() : I->getType(); + // An aligned load or store is possible only if the instruction + // with the lower offset has an alignment suitable for the + // vector type. + + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + + Type *VType = getVecTypeForPair(aType); + unsigned VecAlignment = TD->getPrefTypeAlignment(VType); + if (BottomAlignment < VecAlignment) + return false; + } + } else { + return false; + } + } else if (isa(I)) { + // Only merge two shuffles if they're both constant + return isa(I->getOperand(2)) && + isa(J->getOperand(2)); + // FIXME: We may want to vectorize non-constant shuffles also. + } + + return true; + } + + // Figure out whether or not J uses I and update the users and write-set + // structures associated with I. Specifically, Users represents the set of + // instructions that depend on I. WriteSet represents the set + // of memory locations that are dependent on I. If UpdateUsers is true, + // and J uses I, then Users is updated to contain J and WriteSet is updated + // to contain any memory locations to which J writes. The function returns + // true if J uses I. By default, alias analysis is used to determine + // whether J reads from memory that overlaps with a location in WriteSet. + // If LoadMoveSet is not null, then it is a previously-computed multimap + // where the key is the memory-based user instruction and the value is + // the instruction to be compared with I. So, if LoadMoveSet is provided, + // then the alias analysis is not used. This is necessary because this + // function is called during the process of moving instructions during + // vectorization and the results of the alias analysis are not stable during + // that process. + bool BBVectorize::trackUsesOfI(DenseSet &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers, + std::multimap *LoadMoveSet) { + bool UsesI = false; + + // This instruction may already be marked as a user due, for example, to + // being a member of a selected pair. + if (Users.count(J)) + UsesI = true; + + if (!UsesI) + for (User::op_iterator JU = J->op_begin(), e = J->op_end(); + JU != e; ++JU) { + Value *V = *JU; + if (I == V || Users.count(V)) { + UsesI = true; + break; + } + } + if (!UsesI && J->mayReadFromMemory()) { + if (LoadMoveSet) { + VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); + UsesI = isSecondInIteratorPair(I, JPairRange); + } else { + for (AliasSetTracker::iterator W = WriteSet.begin(), + WE = WriteSet.end(); W != WE; ++W) { + for (AliasSet::iterator A = W->begin(), AE = W->end(); + A != AE; ++A) { + AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(), + A->getTBAAInfo()); + if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) { + UsesI = true; + break; + } + } + if (UsesI) break; + } + } + } + + if (UsesI && UpdateUsers) { + if (J->mayWriteToMemory()) WriteSet.add(J); + Users.insert(J); + } + + return UsesI; + } + + // This function iterates over all instruction pairs in the provided + // basic block and collects all candidate pairs for vectorization. + void BBVectorize::getCandidatePairs(BasicBlock &BB, + std::multimap &CandidatePairs, + std::vector &PairableInsts) { + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + bool IsSimpleLoadStore; + if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; + + // Look for an instruction with which to pair instruction *I... + DenseSet Users; + AliasSetTracker WriteSet(*AA); + BasicBlock::iterator J = I; ++J; + for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) { + // Determine if J uses I, if so, exit the loop. + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep); + if (FastDep) { + // Note: For this heuristic to be effective, independent operations + // must tend to be intermixed. This is likely to be true from some + // kinds of grouped loop unrolling (but not the generic LLVM pass), + // but otherwise may require some kind of reordering pass. + + // When using fast dependency analysis, + // stop searching after first use: + if (UsesI) break; + } else { + if (UsesI) continue; + } + + // J does not use I, and comes before the first use of I, so it can be + // merged with I if the instructions are compatible. + if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + + // J is a candidate for merging with I. + if (!PairableInsts.size() || + PairableInsts[PairableInsts.size()-1] != I) { + PairableInsts.push_back(I); + } + CandidatePairs.insert(ValuePair(I, J)); + DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " + << *I << " <-> " << *J << "\n"); + } + } + + DEBUG(dbgs() << "BBV: found " << PairableInsts.size() + << " instructions with candidate pairs\n"); + } + + // Finds candidate pairs connected to the pair P = . This means that + // it looks for pairs such that both members have an input which is an + // output of PI or PJ. + void BBVectorize::computePairsConnectedTo( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + ValuePair P) { + // For each possible pairing for this variable, look at the uses of + // the first value... + for (Value::use_iterator I = P.first->use_begin(), + E = P.first->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + // For each use of the first variable, look for uses of the second + // variable... + for (Value::use_iterator J = P.second->use_begin(), + E2 = P.second->use_end(); J != E2; ++J) { + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); + + // Look for : + if (isSecondInIteratorPair(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + + // Look for : + if (isSecondInIteratorPair(*I, JPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + } + + if (SplatBreaksChain) continue; + // Look for cases where just the first value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + + if (SplatBreaksChain) return; + // Look for cases where just the second value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator I = P.second->use_begin(), + E = P.second->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + } + + // This function figures out which pairs are connected. Two pairs are + // connected if some output of the first pair forms an input to both members + // of the second pair. + void BBVectorize::computeConnectedPairs( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs) { + + for (std::vector::iterator PI = PairableInsts.begin(), + PE = PairableInsts.end(); PI != PE; ++PI) { + VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + + for (std::multimap::iterator P = choiceRange.first; + P != choiceRange.second; ++P) + computePairsConnectedTo(CandidatePairs, PairableInsts, + ConnectedPairs, *P); + } + + DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() + << " pair connections.\n"); + } + + // This function builds a set of use tuples such that is in the set + // if B is in the use tree of A. If B is in the use tree of A, then B + // depends on the output of A. + void BBVectorize::buildDepMap( + BasicBlock &BB, + std::multimap &CandidatePairs, + std::vector &PairableInsts, + DenseSet &PairableInstUsers) { + DenseSet IsInPair; + for (std::multimap::iterator C = CandidatePairs.begin(), + E = CandidatePairs.end(); C != E; ++C) { + IsInPair.insert(C->first); + IsInPair.insert(C->second); + } + + // Iterate through the basic block, recording all Users of each + // pairable instruction. + + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + if (IsInPair.find(I) == IsInPair.end()) continue; + + DenseSet Users; + AliasSetTracker WriteSet(*AA); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + (void) trackUsesOfI(Users, WriteSet, I, J); + + for (DenseSet::iterator U = Users.begin(), E = Users.end(); + U != E; ++U) + PairableInstUsers.insert(ValuePair(I, *U)); + } + } + + // Returns true if an input to pair P is an output of pair Q and also an + // input of pair Q is an output of pair P. If this is the case, then these + // two pairs cannot be simultaneously fused. + bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, + DenseSet &PairableInstUsers, + std::multimap *PairableInstUserMap) { + // Two pairs are in conflict if they are mutual Users of eachother. + bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || + PairableInstUsers.count(ValuePair(P.first, Q.second)) || + PairableInstUsers.count(ValuePair(P.second, Q.first)) || + PairableInstUsers.count(ValuePair(P.second, Q.second)); + bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || + PairableInstUsers.count(ValuePair(Q.first, P.second)) || + PairableInstUsers.count(ValuePair(Q.second, P.first)) || + PairableInstUsers.count(ValuePair(Q.second, P.second)); + if (PairableInstUserMap) { + // FIXME: The expensive part of the cycle check is not so much the cycle + // check itself but this edge insertion procedure. This needs some + // profiling and probably a different data structure (same is true of + // most uses of std::multimap). + if (PUsesQ) { + VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); + if (!isSecondInIteratorPair(P, QPairRange)) + PairableInstUserMap->insert(VPPair(Q, P)); + } + if (QUsesP) { + VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); + if (!isSecondInIteratorPair(Q, PPairRange)) + PairableInstUserMap->insert(VPPair(P, Q)); + } + } + + return (QUsesP && PUsesQ); + } + + // This function walks the use graph of current pairs to see if, starting + // from P, the walk returns to P. + bool BBVectorize::pairWillFormCycle(ValuePair P, + std::multimap &PairableInstUserMap, + DenseSet &CurrentPairs) { + DEBUG(if (DebugCycleCheck) + dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " + << *P.second << "\n"); + // A lookup table of visisted pairs is kept because the PairableInstUserMap + // contains non-direct associations. + DenseSet Visited; + std::vector Q; + // General depth-first post-order traversal: + Q.push_back(P); + while (!Q.empty()) { + ValuePair QTop = Q.back(); + + Visited.insert(QTop); + Q.pop_back(); + + DEBUG(if (DebugCycleCheck) + dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> " + << *QTop.second << "\n"); + VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop); + for (std::multimap::iterator C = QPairRange.first; + C != QPairRange.second; ++C) { + if (C->second == P) { + DEBUG(dbgs() + << "BBV: rejected to prevent non-trivial cycle formation: " + << *C->first.first << " <-> " << *C->first.second << "\n"); + return true; + } + + if (CurrentPairs.count(C->second) > 0 && + Visited.count(C->second) == 0) + Q.push_back(C->second); + } + } + + return false; + } + + // This function builds the initial tree of connected pairs with the + // pair J at the root. + void BBVectorize::buildInitialTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + DenseMap &ChosenPairs, + DenseMap &Tree, ValuePair J) { + // Each of these pairs is viewed as the root node of a Tree. The Tree + // is then walked (depth-first). As this happens, we keep track of + // the pairs that compose the Tree and the maximum depth of the Tree. + std::vector Q; + // General depth-first post-order traversal: + Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); + while (!Q.empty()) { + ValuePairWithDepth QTop = Q.back(); + + // Push each child onto the queue: + bool MoreChildren = false; + size_t MaxChildDepth = QTop.second; + VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first); + for (std::map::iterator k = qtRange.first; + k != qtRange.second; ++k) { + // Make sure that this child pair is still a candidate: + bool IsStillCand = false; + VPIteratorPair checkRange = + CandidatePairs.equal_range(k->second.first); + for (std::multimap::iterator m = checkRange.first; + m != checkRange.second; ++m) { + if (m->second == k->second.second) { + IsStillCand = true; + break; + } + } + + if (IsStillCand) { + DenseMap::iterator C = Tree.find(k->second); + if (C == Tree.end()) { + size_t d = getDepthFactor(k->second.first); + Q.push_back(ValuePairWithDepth(k->second, QTop.second+d)); + MoreChildren = true; + } else { + MaxChildDepth = std::max(MaxChildDepth, C->second); + } + } + } + + if (!MoreChildren) { + // Record the current pair as part of the Tree: + Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth)); + Q.pop_back(); + } + } + } + + // Given some initial tree, prune it by removing conflicting pairs (pairs + // that cannot be simultaneously chosen for vectorization). + void BBVectorize::pruneTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + std::multimap &PairableInstUserMap, + DenseMap &ChosenPairs, + DenseMap &Tree, + DenseSet &PrunedTree, ValuePair J, + bool UseCycleCheck) { + std::vector Q; + // General depth-first post-order traversal: + Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first))); + while (!Q.empty()) { + ValuePairWithDepth QTop = Q.back(); + PrunedTree.insert(QTop.first); + Q.pop_back(); + + // Visit each child, pruning as necessary... + DenseMap BestChilden; + VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first); + for (std::map::iterator K = QTopRange.first; + K != QTopRange.second; ++K) { + DenseMap::iterator C = Tree.find(K->second); + if (C == Tree.end()) continue; + + // This child is in the Tree, now we need to make sure it is the + // best of any conflicting children. There could be multiple + // conflicting children, so first, determine if we're keeping + // this child, then delete conflicting children as necessary. + + // It is also necessary to guard against pairing-induced + // dependencies. Consider instructions a .. x .. y .. b + // such that (a,b) are to be fused and (x,y) are to be fused + // but a is an input to x and b is an output from y. This + // means that y cannot be moved after b but x must be moved + // after b for (a,b) to be fused. In other words, after + // fusing (a,b) we have y .. a/b .. x where y is an input + // to a/b and x is an output to a/b: x and y can no longer + // be legally fused. To prevent this condition, we must + // make sure that a child pair added to the Tree is not + // both an input and output of an already-selected pair. + + // Pairing-induced dependencies can also form from more complicated + // cycles. The pair vs. pair conflicts are easy to check, and so + // that is done explicitly for "fast rejection", and because for + // child vs. child conflicts, we may prefer to keep the current + // pair in preference to the already-selected child. + DenseSet CurrentPairs; + + bool CanAdd = true; + for (DenseMap::iterator C2 + = BestChilden.begin(), E2 = BestChilden.end(); + C2 != E2; ++C2) { + if (C2->first.first == C->first.first || + C2->first.first == C->first.second || + C2->first.second == C->first.first || + C2->first.second == C->first.second || + pairsConflict(C2->first, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + if (C2->second >= C->second) { + CanAdd = false; + break; + } + + CurrentPairs.insert(C2->first); + } + } + if (!CanAdd) continue; + + // Even worse, this child could conflict with another node already + // selected for the Tree. If that is the case, ignore this child. + for (DenseSet::iterator T = PrunedTree.begin(), + E2 = PrunedTree.end(); T != E2; ++T) { + if (T->first == C->first.first || + T->first == C->first.second || + T->second == C->first.first || + T->second == C->first.second || + pairsConflict(*T, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(*T); + } + if (!CanAdd) continue; + + // And check the queue too... + for (std::vector::iterator C2 = Q.begin(), + E2 = Q.end(); C2 != E2; ++C2) { + if (C2->first.first == C->first.first || + C2->first.first == C->first.second || + C2->first.second == C->first.first || + C2->first.second == C->first.second || + pairsConflict(C2->first, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(C2->first); + } + if (!CanAdd) continue; + + // Last but not least, check for a conflict with any of the + // already-chosen pairs. + for (DenseMap::iterator C2 = + ChosenPairs.begin(), E2 = ChosenPairs.end(); + C2 != E2; ++C2) { + if (pairsConflict(*C2, C->first, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + CanAdd = false; + break; + } + + CurrentPairs.insert(*C2); + } + if (!CanAdd) continue; + + // To check for non-trivial cycles formed by the addition of the + // current pair we've formed a list of all relevant pairs, now use a + // graph walk to check for a cycle. We start from the current pair and + // walk the use tree to see if we again reach the current pair. If we + // do, then the current pair is rejected. + + // FIXME: It may be more efficient to use a topological-ordering + // algorithm to improve the cycle check. This should be investigated. + if (UseCycleCheck && + pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs)) + continue; + + // This child can be added, but we may have chosen it in preference + // to an already-selected child. Check for this here, and if a + // conflict is found, then remove the previously-selected child + // before adding this one in its place. + for (DenseMap::iterator C2 + = BestChilden.begin(); C2 != BestChilden.end();) { + if (C2->first.first == C->first.first || + C2->first.first == C->first.second || + C2->first.second == C->first.first || + C2->first.second == C->first.second || + pairsConflict(C2->first, C->first, PairableInstUsers)) + BestChilden.erase(C2++); + else + ++C2; + } + + BestChilden.insert(ValuePairWithDepth(C->first, C->second)); + } + + for (DenseMap::iterator C + = BestChilden.begin(), E2 = BestChilden.end(); + C != E2; ++C) { + size_t DepthF = getDepthFactor(C->first.first); + Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF)); + } + } + } + + // This function finds the best tree of mututally-compatible connected + // pairs, given the choice of root pairs as an iterator range. + void BBVectorize::findBestTreeFor( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + std::multimap &PairableInstUserMap, + DenseMap &ChosenPairs, + DenseSet &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck) { + for (std::multimap::iterator J = ChoiceRange.first; + J != ChoiceRange.second; ++J) { + + // Before going any further, make sure that this pair does not + // conflict with any already-selected pairs (see comment below + // near the Tree pruning for more details). + DenseSet ChosenPairSet; + bool DoesConflict = false; + for (DenseMap::iterator C = ChosenPairs.begin(), + E = ChosenPairs.end(); C != E; ++C) { + if (pairsConflict(*C, *J, PairableInstUsers, + UseCycleCheck ? &PairableInstUserMap : 0)) { + DoesConflict = true; + break; + } + + ChosenPairSet.insert(*C); + } + if (DoesConflict) continue; + + if (UseCycleCheck && + pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet)) + continue; + + DenseMap Tree; + buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs, Tree, *J); + + // Because we'll keep the child with the largest depth, the largest + // depth is still the same in the unpruned Tree. + size_t MaxDepth = Tree.lookup(*J); + + DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {" + << *J->first << " <-> " << *J->second << "} of depth " << + MaxDepth << " and size " << Tree.size() << "\n"); + + // At this point the Tree has been constructed, but, may contain + // contradictory children (meaning that different children of + // some tree node may be attempting to fuse the same instruction). + // So now we walk the tree again, in the case of a conflict, + // keep only the child with the largest depth. To break a tie, + // favor the first child. + + DenseSet PrunedTree; + pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, + PrunedTree, *J, UseCycleCheck); + + size_t EffSize = 0; + for (DenseSet::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += getDepthFactor(S->first); + + DEBUG(if (DebugPairSelection) + dbgs() << "BBV: found pruned Tree for pair {" + << *J->first << " <-> " << *J->second << "} of depth " << + MaxDepth << " and size " << PrunedTree.size() << + " (effective size: " << EffSize << ")\n"); + if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) { + BestMaxDepth = MaxDepth; + BestEffSize = EffSize; + BestTree = PrunedTree; + } + } + } + + // Given the list of candidate pairs, this function selects those + // that will be fused into vector instructions. + void BBVectorize::choosePairs( + std::multimap &CandidatePairs, + std::vector &PairableInsts, + std::multimap &ConnectedPairs, + DenseSet &PairableInstUsers, + DenseMap& ChosenPairs) { + bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck; + std::multimap PairableInstUserMap; + for (std::vector::iterator I = PairableInsts.begin(), + E = PairableInsts.end(); I != E; ++I) { + // The number of possible pairings for this variable: + size_t NumChoices = CandidatePairs.count(*I); + if (!NumChoices) continue; + + VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); + + // The best pair to choose and its tree: + size_t BestMaxDepth = 0, BestEffSize = 0; + DenseSet BestTree; + findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, PairableInstUserMap, ChosenPairs, + BestTree, BestMaxDepth, BestEffSize, ChoiceRange, + UseCycleCheck); + + // A tree has been chosen (or not) at this point. If no tree was + // chosen, then this instruction, I, cannot be paired (and is no longer + // considered). + + DEBUG(if (BestTree.size() > 0) + dbgs() << "BBV: selected pairs in the best tree for: " + << *cast(*I) << "\n"); + + for (DenseSet::iterator S = BestTree.begin(), + SE2 = BestTree.end(); S != SE2; ++S) { + // Insert the members of this tree into the list of chosen pairs. + ChosenPairs.insert(ValuePair(S->first, S->second)); + DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " << + *S->second << "\n"); + + // Remove all candidate pairs that have values in the chosen tree. + for (std::multimap::iterator K = + CandidatePairs.begin(); K != CandidatePairs.end();) { + if (K->first == S->first || K->second == S->first || + K->second == S->second || K->first == S->second) { + // Don't remove the actual pair chosen so that it can be used + // in subsequent tree selections. + if (!(K->first == S->first && K->second == S->second)) + CandidatePairs.erase(K++); + else + ++K; + } else { + ++K; + } + } + } + } + + DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n"); + } + + std::string getReplacementName(Instruction *I, bool IsInput, unsigned o, + unsigned n = 0) { + if (!I->hasName()) + return ""; + + return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) + + (n > 0 ? "." + utostr(n) : "")).str(); + } + + // Returns the value that is to be used as the pointer input to the vector + // instruction that fuses I with J. + Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, + Instruction *I, Instruction *J, unsigned o, + bool &FlipMemInputs) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts; + (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts); + + // The pointer value is taken to be the one with the lowest offset. + Value *VPtr; + if (OffsetInElmts > 0) { + VPtr = IPtr; + } else { + FlipMemInputs = true; + VPtr = JPtr; + } + + Type *ArgType = cast(IPtr->getType())->getElementType(); + Type *VArgType = getVecTypeForPair(ArgType); + Type *VArgPtrType = PointerType::get(VArgType, + cast(IPtr->getType())->getAddressSpace()); + return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), + /* insert before */ FlipMemInputs ? J : I); + } + + void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector &Mask) { + for (unsigned v = 0; v < NumElem/2; ++v) { + int m = cast(J)->getMaskValue(v); + if (m < 0) { + Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); + } else { + unsigned mm = m + (int) IdxOffset; + if (m >= (int) NumInElem) + mm += (int) NumInElem; + + Mask[v+MaskOffset] = + ConstantInt::get(Type::getInt32Ty(Context), mm); + } + } + } + + // Returns the value that is to be used as the vector-shuffle mask to the + // vector instruction that fuses I with J. + Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context, + Instruction *I, Instruction *J) { + // This is the shuffle mask. We need to append the second + // mask to the first, and the numbers need to be adjusted. + + Type *ArgType = I->getType(); + Type *VArgType = getVecTypeForPair(ArgType); + + // Get the total number of elements in the fused vector type. + // By definition, this must equal the number of elements in + // the final mask. + unsigned NumElem = cast(VArgType)->getNumElements(); + std::vector Mask(NumElem); + + Type *OpType = I->getOperand(0)->getType(); + unsigned NumInElem = cast(OpType)->getNumElements(); + + // For the mask from the first pair... + fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); + + // For the mask from the second pair... + fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, + Mask); + + return ConstantVector::get(Mask); + } + + // Returns the value to be used as the specified operand of the vector + // instruction that fuses I with J. + Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs) { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); + + // Compute the fused vector type for this operand + Type *ArgType = I->getOperand(o)->getType(); + VectorType *VArgType = getVecTypeForPair(ArgType); + + Instruction *L = I, *H = J; + if (FlipMemInputs) { + L = J; + H = I; + } + + if (ArgType->isVectorTy()) { + unsigned numElem = cast(VArgType)->getNumElements(); + std::vector Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + + Instruction *BV = new ShuffleVectorInst(L->getOperand(o), + H->getOperand(o), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + BV->insertBefore(J); + return BV; + } + + // If these two inputs are the output of another vector instruction, + // then we should use that output directly. It might be necessary to + // permute it first. [When pairings are fused recursively, you can + // end up with cases where a large vector is decomposed into scalars + // using extractelement instructions, then built into size-2 + // vectors using insertelement and the into larger vectors using + // shuffles. InstCombine does not simplify all of these cases well, + // and so we make sure that shuffles are generated here when possible. + ExtractElementInst *LEE + = dyn_cast(L->getOperand(o)); + ExtractElementInst *HEE + = dyn_cast(H->getOperand(o)); + + if (LEE && HEE && + LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { + VectorType *EEType = cast(LEE->getOperand(0)->getType()); + unsigned LowIndx = cast(LEE->getOperand(1))->getZExtValue(); + unsigned HighIndx = cast(HEE->getOperand(1))->getZExtValue(); + if (LEE->getOperand(0) == HEE->getOperand(0)) { + if (LowIndx == 0 && HighIndx == 1) + return LEE->getOperand(0); + + std::vector Mask(2); + Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); + Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + + Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), + UndefValue::get(EEType), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + BV->insertBefore(J); + return BV; + } + + std::vector Mask(2); + HighIndx += EEType->getNumElements(); + Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); + Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + + Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), + HEE->getOperand(0), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + BV->insertBefore(J); + return BV; + } + + Instruction *BV1 = InsertElementInst::Create( + UndefValue::get(VArgType), + L->getOperand(o), CV0, + getReplacementName(I, true, o, 1)); + BV1->insertBefore(I); + Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), + CV1, + getReplacementName(I, true, o, 2)); + BV2->insertBefore(J); + return BV2; + } + + // This function creates an array of values that will be used as the inputs + // to the vector instruction that fuses I with J. + void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, + Instruction *I, Instruction *J, + SmallVector &ReplacedOperands, + bool &FlipMemInputs) { + FlipMemInputs = false; + unsigned NumOperands = I->getNumOperands(); + + for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { + // Iterate backward so that we look at the store pointer + // first and know whether or not we need to flip the inputs. + + if (isa(I) || (o == 1 && isa(I))) { + // This is the pointer for a load/store instruction. + ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, + FlipMemInputs); + continue; + } else if (isa(I) && o == NumOperands-1) { + Function *F = cast(I)->getCalledFunction(); + unsigned IID = F->getIntrinsicID(); + BasicBlock &BB = *I->getParent(); + + Module *M = BB.getParent()->getParent(); + Type *ArgType = I->getType(); + Type *VArgType = getVecTypeForPair(ArgType); + + // FIXME: is it safe to do this here? + ReplacedOperands[o] = Intrinsic::getDeclaration(M, + (Intrinsic::ID) IID, VArgType); + continue; + } else if (isa(I) && o == NumOperands-1) { + ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J); + continue; + } + + ReplacedOperands[o] = + getReplacementInput(Context, I, J, o, FlipMemInputs); + } + } + + // This function creates two values that represent the outputs of the + // original I and J instructions. These are generally vector shuffles + // or extracts. In many cases, these will end up being unused and, thus, + // eliminated by later passes. + void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, + Instruction *&K1, Instruction *&K2, + bool &FlipMemInputs) { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); + + if (isa(I)) { + AA->replaceWithNewValue(I, K); + AA->replaceWithNewValue(J, K); + } else { + Type *IType = I->getType(); + Type *VType = getVecTypeForPair(IType); + + if (IType->isVectorTy()) { + unsigned numElem = cast(IType)->getNumElements(); + std::vector Mask1(numElem), Mask2(numElem); + for (unsigned v = 0; v < numElem; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); + } + + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask2 : Mask1), + getReplacementName(K, false, 1)); + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask1 : Mask2), + getReplacementName(K, false, 2)); + } else { + K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, + getReplacementName(K, false, 1)); + K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, + getReplacementName(K, false, 2)); + } + + K1->insertAfter(K); + K2->insertAfter(K1); + InsertionPt = K2; + } + } + + // Move all uses of the function I (including pairing-induced uses) after J. + bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB, + std::multimap &LoadMoveSet, + Instruction *I, Instruction *J) { + // Skip to the first instruction past I. + BasicBlock::iterator L = BB.begin(); + for (; cast(L) != I; ++L); + ++L; + + DenseSet Users; + AliasSetTracker WriteSet(*AA); + for (; cast(L) != J; ++L) + (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet); + + assert(cast(L) == J && + "Tracking has not proceeded far enough to check for dependencies"); + // If J is now in the use set of I, then trackUsesOfI will return true + // and we have a dependency cycle (and the fusing operation must abort). + return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet); + } + + // Move all uses of the function I (including pairing-induced uses) after J. + void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J) { + // Skip to the first instruction past I. + BasicBlock::iterator L = BB.begin(); + for (; cast(L) != I; ++L); + ++L; + + DenseSet Users; + AliasSetTracker WriteSet(*AA); + for (; cast(L) != J;) { + if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) { + // Move this instruction + Instruction *InstToMove = L; ++L; + + DEBUG(dbgs() << "BBV: moving: " << *InstToMove << + " to after " << *InsertionPt << "\n"); + InstToMove->removeFromParent(); + InstToMove->insertAfter(InsertionPt); + InsertionPt = InstToMove; + } else { + ++L; + } + } + } + + // Collect all load instruction that are in the move set of a given first + // pair member. These loads depend on the first instruction, I, and so need + // to be moved after J (the second instruction) when the pair is fused. + void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB, + DenseMap &ChosenPairs, + std::multimap &LoadMoveSet, + Instruction *I) { + // Skip to the first instruction past I. + BasicBlock::iterator L = BB.begin(); + for (; cast(L) != I; ++L); + ++L; + + DenseSet Users; + AliasSetTracker WriteSet(*AA); + + // Note: We cannot end the loop when we reach J because J could be moved + // farther down the use chain by another instruction pairing. Also, J + // could be before I if this is an inverted input. + for (BasicBlock::iterator E = BB.end(); cast(L) != E; ++L) { + if (trackUsesOfI(Users, WriteSet, I, L)) { + if (L->mayReadFromMemory()) + LoadMoveSet.insert(ValuePair(L, I)); + } + } + } + + // In cases where both load/stores and the computation of their pointers + // are chosen for vectorization, we can end up in a situation where the + // aliasing analysis starts returning different query results as the + // process of fusing instruction pairs continues. Because the algorithm + // relies on finding the same use trees here as were found earlier, we'll + // need to precompute the necessary aliasing information here and then + // manually update it during the fusion process. + void BBVectorize::collectLoadMoveSet(BasicBlock &BB, + std::vector &PairableInsts, + DenseMap &ChosenPairs, + std::multimap &LoadMoveSet) { + for (std::vector::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast(P->first); + collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I); + } + } + + // This function fuses the chosen instruction pairs into vector instructions, + // taking care preserve any needed scalar outputs and, then, it reorders the + // remaining instructions as needed (users of the first member of the pair + // need to be moved to after the location of the second member of the pair + // because the vector instruction is inserted in the location of the pair's + // second member). + void BBVectorize::fuseChosenPairs(BasicBlock &BB, + std::vector &PairableInsts, + DenseMap &ChosenPairs) { + LLVMContext& Context = BB.getContext(); + + // During the vectorization process, the order of the pairs to be fused + // could be flipped. So we'll add each pair, flipped, into the ChosenPairs + // list. After a pair is fused, the flipped pair is removed from the list. + std::vector FlippedPairs; + FlippedPairs.reserve(ChosenPairs.size()); + for (DenseMap::iterator P = ChosenPairs.begin(), + E = ChosenPairs.end(); P != E; ++P) + FlippedPairs.push_back(ValuePair(P->second, P->first)); + for (std::vector::iterator P = FlippedPairs.begin(), + E = FlippedPairs.end(); P != E; ++P) + ChosenPairs.insert(*P); + + std::multimap LoadMoveSet; + collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); + + DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); + + for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { + DenseMap::iterator P = ChosenPairs.find(PI); + if (P == ChosenPairs.end()) { + ++PI; + continue; + } + + if (getDepthFactor(P->first) == 0) { + // These instructions are not really fused, but are tracked as though + // they are. Any case in which it would be interesting to fuse them + // will be taken care of by InstCombine. + --NumFusedOps; + ++PI; + continue; + } + + Instruction *I = cast(P->first), + *J = cast(P->second); + + DEBUG(dbgs() << "BBV: fusing: " << *I << + " <-> " << *J << "\n"); + + // Remove the pair and flipped pair from the list. + DenseMap::iterator FP = ChosenPairs.find(P->second); + assert(FP != ChosenPairs.end() && "Flipped pair not found in list"); + ChosenPairs.erase(FP); + ChosenPairs.erase(P); + + if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) { + DEBUG(dbgs() << "BBV: fusion of: " << *I << + " <-> " << *J << + " aborted because of non-trivial dependency cycle\n"); + --NumFusedOps; + ++PI; + continue; + } + + bool FlipMemInputs; + unsigned NumOperands = I->getNumOperands(); + SmallVector ReplacedOperands(NumOperands); + getReplacementInputsForPair(Context, I, J, ReplacedOperands, + FlipMemInputs); + + // Make a copy of the original operation, change its type to the vector + // type and replace its operands with the vector operands. + Instruction *K = I->clone(); + if (I->hasName()) K->takeName(I); + + if (!isa(K)) + K->mutateType(getVecTypeForPair(I->getType())); + + for (unsigned o = 0; o < NumOperands; ++o) + K->setOperand(o, ReplacedOperands[o]); + + // If we've flipped the memory inputs, make sure that we take the correct + // alignment. + if (FlipMemInputs) { + if (isa(K)) + cast(K)->setAlignment(cast(J)->getAlignment()); + else + cast(K)->setAlignment(cast(J)->getAlignment()); + } + + K->insertAfter(J); + + // Instruction insertion point: + Instruction *InsertionPt = K; + Instruction *K1 = 0, *K2 = 0; + replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, + FlipMemInputs); + + // The use tree of the first original instruction must be moved to after + // the location of the second instruction. The entire use tree of the + // first instruction is disjoint from the input tree of the second + // (by definition), and so commutes with it. + + moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); + + if (!isa(I)) { + I->replaceAllUsesWith(K1); + J->replaceAllUsesWith(K2); + AA->replaceWithNewValue(I, K1); + AA->replaceWithNewValue(J, K2); + } + + // Instructions that may read from memory may be in the load move set. + // Once an instruction is fused, we no longer need its move set, and so + // the values of the map never need to be updated. However, when a load + // is fused, we need to merge the entries from both instructions in the + // pair in case those instructions were in the move set of some other + // yet-to-be-fused pair. The loads in question are the keys of the map. + if (I->mayReadFromMemory()) { + std::vector NewSetMembers; + VPIteratorPair IPairRange = LoadMoveSet.equal_range(I); + VPIteratorPair JPairRange = LoadMoveSet.equal_range(J); + for (std::multimap::iterator N = IPairRange.first; + N != IPairRange.second; ++N) + NewSetMembers.push_back(ValuePair(K, N->second)); + for (std::multimap::iterator N = JPairRange.first; + N != JPairRange.second; ++N) + NewSetMembers.push_back(ValuePair(K, N->second)); + for (std::vector::iterator A = NewSetMembers.begin(), + AE = NewSetMembers.end(); A != AE; ++A) + LoadMoveSet.insert(*A); + } + + // Before removing I, set the iterator to the next instruction. + PI = llvm::next(BasicBlock::iterator(I)); + if (cast(PI) == J) + ++PI; + + SE->forgetValue(I); + SE->forgetValue(J); + I->eraseFromParent(); + J->eraseFromParent(); + } + + DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); + } +} + +char BBVectorize::ID = 0; +static const char bb_vectorize_name[] = "Basic-Block Vectorization"; +INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false) + +BasicBlockPass *llvm::createBBVectorizePass() { + return new BBVectorize(); +} + diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt new file mode 100644 index 0000000000..4b6693015c --- /dev/null +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -0,0 +1,4 @@ +add_llvm_library(LLVMVectorize + BBVectorize.cpp + Vectorize.cpp + ) diff --git a/lib/Transforms/Vectorize/LLVMBuild.txt b/lib/Transforms/Vectorize/LLVMBuild.txt new file mode 100644 index 0000000000..7167d273ae --- /dev/null +++ b/lib/Transforms/Vectorize/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/Transforms/Scalar/LLVMBuild.txt --------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Vectorize +parent = Transforms +library_name = Vectorize +required_libraries = Analysis Core InstCombine Support Target TransformUtils + diff --git a/lib/Transforms/Vectorize/Makefile b/lib/Transforms/Vectorize/Makefile new file mode 100644 index 0000000000..86c36585f2 --- /dev/null +++ b/lib/Transforms/Vectorize/Makefile @@ -0,0 +1,15 @@ +##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMVectorize +BUILD_ARCHIVE = 1 + +include $(LEVEL)/Makefile.common + diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp new file mode 100644 index 0000000000..1ef60029bc --- /dev/null +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -0,0 +1,39 @@ +//===-- Vectorize.cpp -----------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMVectorizeOpts.a, which +// implements several vectorization transformations over the LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm-c/Transforms/Vectorize.h" +#include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" +#include "llvm/PassManager.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Transforms/Vectorize.h" + +using namespace llvm; + +/// initializeVectorizationPasses - Initialize all passes linked into the +/// Vectorization library. +void llvm::initializeVectorization(PassRegistry &Registry) { + initializeBBVectorizePass(Registry); +} + +void LLVMInitializeVectorization(LLVMPassRegistryRef R) { + initializeVectorization(*unwrap(R)); +} + +void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createBBVectorizePass()); +} + diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll new file mode 100644 index 0000000000..32a91ceee0 --- /dev/null +++ b/test/Transforms/BBVectorize/cycle.ll @@ -0,0 +1,112 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise +; want to select the pairs: +; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117) +; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77) +; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84) +; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95) +; and so a dependency cycle would be created. + +declare double @fabs(double) nounwind readnone +define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) { +entry: + br label %go +go: + %conv = sitofp i32 %n.0 to double + %add35 = fadd double %conv, %a + %sub36 = fadd double %add35, -1.000000e+00 + %add38 = fadd double %conv, %b + %sub39 = fadd double %add38, -1.000000e+00 + %add41 = fadd double %conv, %c + %sub42 = fadd double %add41, -1.000000e+00 + %sub45 = fadd double %add35, -2.000000e+00 + %sub48 = fadd double %add38, -2.000000e+00 + %sub51 = fadd double %add41, -2.000000e+00 + %mul52 = shl nsw i32 %n.0, 1 + %sub53 = add nsw i32 %mul52, -1 + %conv54 = sitofp i32 %sub53 to double + %sub56 = add nsw i32 %mul52, -3 + %conv57 = sitofp i32 %sub56 to double + %sub59 = add nsw i32 %mul52, -5 + %conv60 = sitofp i32 %sub59 to double + %mul61 = mul nsw i32 %n.0, %n.0 + %conv62 = sitofp i32 %mul61 to double + %mul63 = fmul double %conv62, 3.000000e+00 + %mul67 = fmul double %sub65, %conv + %add68 = fadd double %mul63, %mul67 + %add69 = fadd double %add68, 2.000000e+00 + %sub71 = fsub double %add69, %mul2.v.r1 + %sub74 = fsub double %sub71, %mul73 + %mul75 = fmul double %conv57, 2.000000e+00 + %mul76 = fmul double %mul75, %sub42 + %div77 = fdiv double %sub74, %mul76 + %mul82 = fmul double %add80, %conv + %sub83 = fsub double %mul63, %mul82 + %add84 = fadd double %sub83, 2.000000e+00 + %sub86 = fsub double %add84, %mul2.v.r1 + %sub87 = fsub double -0.000000e+00, %sub86 + %mul88 = fmul double %sub36, %sub87 + %mul89 = fmul double %mul88, %sub39 + %mul90 = fmul double %conv54, 4.000000e+00 + %mul91 = fmul double %mul90, %conv57 + %mul92 = fmul double %mul91, %sub51 + %mul93 = fmul double %mul92, %sub42 + %div94 = fdiv double %mul89, %mul93 + %mul95 = fmul double %sub45, %sub36 + %mul96 = fmul double %mul95, %sub48 + %mul97 = fmul double %mul96, %sub39 + %sub99 = fsub double %conv, %a + %sub100 = fadd double %sub99, -2.000000e+00 + %mul101 = fmul double %mul97, %sub100 + %sub103 = fsub double %conv, %b + %sub104 = fadd double %sub103, -2.000000e+00 + %mul105 = fmul double %mul101, %sub104 + %mul106 = fmul double %conv57, 8.000000e+00 + %mul107 = fmul double %mul106, %conv57 + %mul108 = fmul double %mul107, %conv60 + %sub111 = fadd double %add41, -3.000000e+00 + %mul112 = fmul double %mul108, %sub111 + %mul113 = fmul double %mul112, %sub51 + %mul114 = fmul double %mul113, %sub42 + %div115 = fdiv double %mul105, %mul114 + %sub116 = fsub double -0.000000e+00, %sub36 + %mul117 = fmul double %sub39, %sub116 + %sub119 = fsub double %conv, %c + %sub120 = fadd double %sub119, -1.000000e+00 + %mul121 = fmul double %mul117, %sub120 + %mul123 = fmul double %mul75, %sub51 + %mul124 = fmul double %mul123, %sub42 + %div125 = fdiv double %mul121, %mul124 + %mul126 = fmul double %div77, %sub + %add127 = fadd double %mul126, 1.000000e+00 + %mul128 = fmul double %add127, %Anm1.0 + %mul129 = fmul double %div94, %sub + %add130 = fadd double %div125, %mul129 + %mul131 = fmul double %add130, %sub + %mul132 = fmul double %mul131, %Anm2.0 + %add133 = fadd double %mul128, %mul132 + %mul134 = fmul double %div115, %mul1 + %mul135 = fmul double %mul134, %Anm3.0 + %add136 = fadd double %add133, %mul135 + %mul139 = fmul double %add127, %Bnm1.0 + %mul143 = fmul double %mul131, %Bnm2.0 + %add144 = fadd double %mul139, %mul143 + %mul146 = fmul double %mul134, %Bnm3.0 + %add147 = fadd double %add144, %mul146 + %div148 = fdiv double %add136, %add147 + %sub149 = fsub double %F.0, %div148 + %div150 = fdiv double %sub149, %F.0 + %call = tail call double @fabs(double %div150) nounwind readnone + %cmp = fcmp olt double %call, 0x3CB0000000000000 + %cmp152 = icmp sgt i32 %n.0, 20000 + %or.cond = or i1 %cmp, %cmp152 + br i1 %or.cond, label %done, label %go +done: + ret void +; CHECK: @test1 +; CHECK: go: +; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0 +; FIXME: When tree pruning is deterministic, include the entire output. +} diff --git a/test/Transforms/BBVectorize/dg.exp b/test/Transforms/BBVectorize/dg.exp new file mode 100644 index 0000000000..f2005891a5 --- /dev/null +++ b/test/Transforms/BBVectorize/dg.exp @@ -0,0 +1,3 @@ +load_lib llvm.exp + +RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]] diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll new file mode 100644 index 0000000000..cea225d076 --- /dev/null +++ b/test/Transforms/BBVectorize/ld1.ll @@ -0,0 +1,41 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %i2 = load double* %c, align 8 + %add = fadd double %mul, %i2 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + %arrayidx6 = getelementptr inbounds double* %c, i64 1 + %i5 = load double* %arrayidx6, align 8 + %add7 = fadd double %mul5, %i5 + %mul9 = fmul double %add, %i1 + %add11 = fadd double %mul9, %i2 + %mul13 = fmul double %add7, %i4 + %add15 = fadd double %mul13, %i5 + %mul16 = fmul double %add11, %add15 + ret double %mul16 +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8 +; CHECK: %add = fadd <2 x double> %mul, %i2 +; CHECK: %mul9 = fmul <2 x double> %add, %i1 +; CHECK: %add11 = fadd <2 x double> %mul9, %i2 +; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0 +; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1 +; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2 +; CHECK: ret double %mul16 +} + diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll new file mode 100644 index 0000000000..bebc91ad91 --- /dev/null +++ b/test/Transforms/BBVectorize/loop1.ll @@ -0,0 +1,93 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL +; The second check covers the use of alias analysis (with loop unrolling). + +define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable { +entry: + br label %for.body +; CHECK: @test1 +; CHECK-UNRL: @test1 + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv + %0 = load double* %arrayidx, align 8 + %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv + %1 = load double* %arrayidx2, align 8 + %mul = fmul double %0, %0 + %mul3 = fmul double %0, %1 + %add = fadd double %mul, %mul3 + %add4 = fadd double %1, %1 + %add5 = fadd double %add4, %0 + %mul6 = fmul double %0, %add5 + %add7 = fadd double %add, %mul6 + %mul8 = fmul double %1, %1 + %add9 = fadd double %0, %0 + %add10 = fadd double %add9, %0 + %mul11 = fmul double %mul8, %add10 + %add12 = fadd double %add7, %mul11 + %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv + store double %add12, double* %arrayidx14, align 8 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 10 + br i1 %exitcond, label %for.end, label %for.body +; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] +; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv +; CHECK: %0 = load double* %arrayidx, align 8 +; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv +; CHECK: %1 = load double* %arrayidx2, align 8 +; CHECK: %mul = fmul double %0, %0 +; CHECK: %mul3 = fmul double %0, %1 +; CHECK: %add = fadd double %mul, %mul3 +; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0 +; CHECK: %mul8 = fmul double %1, %1 +; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1 +; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2 +; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0 +; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1 +; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2 +; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1 +; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5 +; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0 +; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1 +; CHECK: %add7 = fadd double %add, %mul6.v.r1 +; CHECK: %add12 = fadd double %add7, %mul6.v.r2 +; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv +; CHECK: store double %add12, double* %arrayidx14, align 8 +; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1 +; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32 +; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10 +; CHECK: br i1 %exitcond, label %for.end, label %for.body +; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ] +; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv +; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>* +; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv +; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>* +; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv +; CHECK-UNRL: %2 = load <2 x double>* %0, align 8 +; CHECK-UNRL: %3 = load <2 x double>* %1, align 8 +; CHECK-UNRL: %mul = fmul <2 x double> %2, %2 +; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3 +; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3 +; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3 +; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2 +; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5 +; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6 +; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3 +; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2 +; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2 +; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10 +; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11 +; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>* +; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8 +; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2 +; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32 +; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10 +; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll new file mode 100644 index 0000000000..8c9cc3c188 --- /dev/null +++ b/test/Transforms/BBVectorize/req-depth.ll @@ -0,0 +1,17 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3 +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2 + +define double @test1(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 + %R = fmul double %Y1, %Y2 + ret double %R +; CHECK-RD3: @test1 +; CHECK-RD2: @test1 +; CHECK-RD3-NOT: <2 x double> +; CHECK-RD2: <2 x double> +} + diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll new file mode 100644 index 0000000000..d9945b5630 --- /dev/null +++ b/test/Transforms/BBVectorize/search-limit.ll @@ -0,0 +1,46 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4 + +define double @test1(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test1 +; CHECK-SL4: @test1 +; CHECK-SL4-NOT: <2 x double> +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + ; Here we have a dependency chain: the short search limit will not + ; see past this chain and so will not see the second part of the + ; pair to vectorize. + %mul41 = fmul double %Z1, %Y2 + %sub48 = fsub double %Z1, %mul41 + %mul62 = fmul double %Z1, %sub48 + %sub69 = fsub double %Z1, %mul62 + %mul83 = fmul double %Z1, %sub69 + %sub90 = fsub double %Z1, %mul83 + %mul104 = fmul double %Z1, %sub90 + %sub111 = fsub double %Z1, %mul104 + %mul125 = fmul double %Z1, %sub111 + %sub132 = fsub double %Z1, %mul125 + %mul146 = fmul double %Z1, %sub132 + %sub153 = fsub double %Z1, %mul146 + ; end of chain. + %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %R1 = fdiv double %Z1, %Z2 + %R = fmul double %R1, %sub153 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll new file mode 100644 index 0000000000..b2ef27b2b8 --- /dev/null +++ b/test/Transforms/BBVectorize/simple-int.ll @@ -0,0 +1,59 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +declare double @llvm.fma.f64(double, double, double) +declare double @llvm.cos.f64(double) + +; Basic depth-3 chain with fma +define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1) + %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test1 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0 +; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1 +; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2) +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 +; CHECK: ret double %R +} + +; Basic depth-3 chain with cos +define double @test2(double %A1, double %A2, double %B1, double %B2) { + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 + %Y1 = call double @llvm.cos.f64(double %X1) + %Y2 = call double @llvm.cos.f64(double %X2) + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 + %R = fmul double %Z1, %Z2 + ret double %R +; CHECK: @test2 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 +; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1) +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 +; CHECK: ret double %R +} + +; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone +; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly + diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll new file mode 100644 index 0000000000..a5397eeb1f --- /dev/null +++ b/test/Transforms/BBVectorize/simple-ldstr.ll @@ -0,0 +1,110 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO + +; Simple 3-pair chain with loads and stores +define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test1 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test1 +; CHECK-AO-NOT: <2 x double> +} + +; Simple chain with extending loads and stores +define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly { +entry: + %i0f = load float* %a, align 4 + %i0 = fpext float %i0f to double + %i1f = load float* %b, align 4 + %i1 = fpext float %i1f to double + %mul = fmul double %i0, %i1 + %arrayidx3 = getelementptr inbounds float* %a, i64 1 + %i3f = load float* %arrayidx3, align 4 + %i3 = fpext float %i3f to double + %arrayidx4 = getelementptr inbounds float* %b, i64 1 + %i4f = load float* %arrayidx4, align 4 + %i4 = fpext float %i4f to double + %mul5 = fmul double %i3, %i4 + store double %mul, double* %c, align 8 + %arrayidx5 = getelementptr inbounds double* %c, i64 1 + store double %mul5, double* %arrayidx5, align 8 + ret void +; CHECK: @test2 +; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>* +; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>* +; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4 +; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double> +; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4 +; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double> +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %0 = bitcast double* %c to <2 x double>* +; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test2 +; CHECK-AO-NOT: <2 x double> +} + +; Simple chain with loads and truncating stores +define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly { +entry: + %i0 = load double* %a, align 8 + %i1 = load double* %b, align 8 + %mul = fmul double %i0, %i1 + %mulf = fptrunc double %mul to float + %arrayidx3 = getelementptr inbounds double* %a, i64 1 + %i3 = load double* %arrayidx3, align 8 + %arrayidx4 = getelementptr inbounds double* %b, i64 1 + %i4 = load double* %arrayidx4, align 8 + %mul5 = fmul double %i3, %i4 + %mul5f = fptrunc double %mul5 to float + store float %mulf, float* %c, align 8 + %arrayidx5 = getelementptr inbounds float* %c, i64 1 + store float %mul5f, float* %arrayidx5, align 4 + ret void +; CHECK: @test3 +; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>* +; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>* +; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8 +; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8 +; CHECK: %mul = fmul <2 x double> %i0, %i1 +; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float> +; CHECK: %0 = bitcast float* %c to <2 x float>* +; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8 +; CHECK: ret void +; CHECK-AO: @test3 +; CHECK-AO: %i0 = load double* %a, align 8 +; CHECK-AO: %i1 = load double* %b, align 8 +; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0 +; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0 +; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1 +; CHECK-AO: %i3 = load double* %arrayidx3, align 8 +; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1 +; CHECK-AO: %i4 = load double* %arrayidx4, align 8 +; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1 +; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1 +; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2 +; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float> +; CHECK-AO: %0 = bitcast float* %c to <2 x float>* +; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8 +; CHECK-AO: ret void +} diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll new file mode 100644 index 0000000000..904d766bb6 --- /dev/null +++ b/test/Transforms/BBVectorize/simple.ll @@ -0,0 +1,152 @@ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" +; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s + +; Basic depth-3 chain +define double @test1(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test1 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair permuted) +define double @test2(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test2 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y1, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair first splat) +define double @test3(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test3 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y2, %B1 + %Z2 = fadd double %Y2, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain (last pair second splat) +define double @test4(double %A1, double %A2, double %B1, double %B2) { +; CHECK: @test4 +; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0 +; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0 +; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1 +; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1 + %X1 = fsub double %A1, %B1 + %X2 = fsub double %A2, %B2 +; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2 + %Y1 = fmul double %X1, %A1 + %Y2 = fmul double %X2, %A2 +; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2 + %Z1 = fadd double %Y1, %B1 + %Z2 = fadd double %Y1, %B2 +; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer +; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2 + %R = fmul double %Z1, %Z2 +; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0 +; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1 +; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2 + ret double %R +; CHECK: ret double %R +} + +; Basic depth-3 chain +define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) { +; CHECK: @test5 +; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> +; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> + %X1 = fsub <2 x float> %A1, %B1 + %X2 = fsub <2 x float> %A2, %B2 +; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1 + %Y1 = fmul <2 x float> %X1, %A1 + %Y2 = fmul <2 x float> %X2, %A2 +; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0 + %Z1 = fadd <2 x float> %Y1, %B1 + %Z2 = fadd <2 x float> %Y2, %B2 +; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1 + %R = fmul <2 x float> %Z1, %Z2 +; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> +; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> +; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2 + ret <2 x float> %R +; CHECK: ret <2 x float> %R +} + +; Basic chain with shuffles +define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) { +; CHECK: @test6 +; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> +; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> + %X1 = sub <8 x i8> %A1, %B1 + %X2 = sub <8 x i8> %A2, %B2 +; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1 + %Y1 = mul <8 x i8> %X1, %A1 + %Y2 = mul <8 x i8> %X2, %A2 +; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0 + %Z1 = add <8 x i8> %Y1, %B1 + %Z2 = add <8 x i8> %Y2, %B2 +; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1 + %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> + %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> +; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> +; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> +; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> + %R = mul <8 x i8> %Q1, %Q2 +; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> +; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> +; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2 + ret <8 x i8> %R +; CHECK: ret <8 x i8> %R +} + + diff --git a/tools/bugpoint/CMakeLists.txt b/tools/bugpoint/CMakeLists.txt index e06feb1003..ee2235bf42 100644 --- a/tools/bugpoint/CMakeLists.txt +++ b/tools/bugpoint/CMakeLists.txt @@ -1,5 +1,5 @@ set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo - linker bitreader bitwriter) + linker bitreader bitwriter vectorize) add_llvm_tool(bugpoint BugDriver.cpp diff --git a/tools/bugpoint/Makefile b/tools/bugpoint/Makefile index eacaa47a7b..34f4bddb01 100644 --- a/tools/bugpoint/Makefile +++ b/tools/bugpoint/Makefile @@ -10,6 +10,6 @@ LEVEL := ../.. TOOLNAME := bugpoint LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \ - bitwriter + bitwriter vectorize include $(LEVEL)/Makefile.common diff --git a/tools/llvm-ld/CMakeLists.txt b/tools/llvm-ld/CMakeLists.txt index 370bcb4abf..d328a04b0e 100644 --- a/tools/llvm-ld/CMakeLists.txt +++ b/tools/llvm-ld/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter) +set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize) add_llvm_tool(llvm-ld Optimize.cpp diff --git a/tools/llvm-ld/Makefile b/tools/llvm-ld/Makefile index 2ec6e0a43a..8793ca9c10 100644 --- a/tools/llvm-ld/Makefile +++ b/tools/llvm-ld/Makefile @@ -9,6 +9,6 @@ LEVEL := ../.. TOOLNAME := llvm-ld -LINK_COMPONENTS := ipo scalaropts linker archive bitwriter +LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize include $(LEVEL)/Makefile.common diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt index 7e2c5f06e6..911297609b 100644 --- a/tools/lto/CMakeLists.txt +++ b/tools/lto/CMakeLists.txt @@ -1,6 +1,6 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} - ipo scalaropts linker bitreader bitwriter mcdisassembler) + ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize) add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" ) diff --git a/tools/lto/Makefile b/tools/lto/Makefile index ef78f82c8c..153fa03137 100644 --- a/tools/lto/Makefile +++ b/tools/lto/Makefile @@ -10,7 +10,7 @@ LEVEL := ../.. LIBRARYNAME := LTO LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \ - mcdisassembler + mcdisassembler vectorize LINK_LIBS_IN_SHARED := 1 SHARED_LIBRARY := 1 diff --git a/tools/opt/CMakeLists.txt b/tools/opt/CMakeLists.txt index 0570d0e04a..7daf22aa9e 100644 --- a/tools/opt/CMakeLists.txt +++ b/tools/opt/CMakeLists.txt @@ -1,4 +1,4 @@ -set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo) +set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize) add_llvm_tool(opt AnalysisWrappers.cpp diff --git a/tools/opt/Makefile b/tools/opt/Makefile index e8cd7e2857..16d116da5d 100644 --- a/tools/opt/Makefile +++ b/tools/opt/Makefile @@ -9,6 +9,6 @@ LEVEL := ../.. TOOLNAME := opt -LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo +LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize include $(LEVEL)/Makefile.common diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 51613649b5..30da863b41 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -480,6 +480,7 @@ int main(int argc, char **argv) { PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); initializeScalarOpts(Registry); + initializeVectorization(Registry); initializeIPO(Registry); initializeAnalysis(Registry); initializeIPA(Registry); -- cgit v1.2.3