From de5e5ec3045a73a06b1054417f9ac6c02929e9ce Mon Sep 17 00:00:00 2001
From: Hal Finkel <hfinkel@anl.gov>
Date: Wed, 1 Feb 2012 03:51:43 +0000
Subject: Add a basic-block autovectorization pass.

This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure.
Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/Passes.html                                 |   21 +
 include/llvm-c/Initialization.h                  |    1 +
 include/llvm-c/Transforms/Vectorize.h            |   37 +
 include/llvm/InitializePasses.h                  |    6 +-
 include/llvm/LinkAllPasses.h                     |    2 +
 include/llvm/Transforms/IPO/PassManagerBuilder.h |    1 +
 include/llvm/Transforms/Vectorize.h              |   30 +
 lib/Transforms/CMakeLists.txt                    |    1 +
 lib/Transforms/IPO/LLVMBuild.txt                 |    2 +-
 lib/Transforms/IPO/PassManagerBuilder.cpp        |   14 +
 lib/Transforms/LLVMBuild.txt                     |    2 +-
 lib/Transforms/Makefile                          |    2 +-
 lib/Transforms/Vectorize/BBVectorize.cpp         | 1796 ++++++++++++++++++++++
 lib/Transforms/Vectorize/CMakeLists.txt          |    4 +
 lib/Transforms/Vectorize/LLVMBuild.txt           |   24 +
 lib/Transforms/Vectorize/Makefile                |   15 +
 lib/Transforms/Vectorize/Vectorize.cpp           |   39 +
 test/Transforms/BBVectorize/cycle.ll             |  112 ++
 test/Transforms/BBVectorize/dg.exp               |    3 +
 test/Transforms/BBVectorize/ld1.ll               |   41 +
 test/Transforms/BBVectorize/loop1.ll             |   93 ++
 test/Transforms/BBVectorize/req-depth.ll         |   17 +
 test/Transforms/BBVectorize/search-limit.ll      |   46 +
 test/Transforms/BBVectorize/simple-int.ll        |   59 +
 test/Transforms/BBVectorize/simple-ldstr.ll      |  110 ++
 test/Transforms/BBVectorize/simple.ll            |  152 ++
 tools/bugpoint/CMakeLists.txt                    |    2 +-
 tools/bugpoint/Makefile                          |    2 +-
 tools/llvm-ld/CMakeLists.txt                     |    2 +-
 tools/llvm-ld/Makefile                           |    2 +-
 tools/lto/CMakeLists.txt                         |    2 +-
 tools/lto/Makefile                               |    2 +-
 tools/opt/CMakeLists.txt                         |    2 +-
 tools/opt/Makefile                               |    2 +-
 tools/opt/opt.cpp                                |    1 +
 35 files changed, 2635 insertions(+), 12 deletions(-)
 create mode 100644 include/llvm-c/Transforms/Vectorize.h
 create mode 100644 include/llvm/Transforms/Vectorize.h
 create mode 100644 lib/Transforms/Vectorize/BBVectorize.cpp
 create mode 100644 lib/Transforms/Vectorize/CMakeLists.txt
 create mode 100644 lib/Transforms/Vectorize/LLVMBuild.txt
 create mode 100644 lib/Transforms/Vectorize/Makefile
 create mode 100644 lib/Transforms/Vectorize/Vectorize.cpp
 create mode 100644 test/Transforms/BBVectorize/cycle.ll
 create mode 100644 test/Transforms/BBVectorize/dg.exp
 create mode 100644 test/Transforms/BBVectorize/ld1.ll
 create mode 100644 test/Transforms/BBVectorize/loop1.ll
 create mode 100644 test/Transforms/BBVectorize/req-depth.ll
 create mode 100644 test/Transforms/BBVectorize/search-limit.ll
 create mode 100644 test/Transforms/BBVectorize/simple-int.ll
 create mode 100644 test/Transforms/BBVectorize/simple-ldstr.ll
 create mode 100644 test/Transforms/BBVectorize/simple.ll
diff --git a/docs/Passes.html b/docs/Passes.html
index 5c42f3fdd5..840b2eff06 100644
--- a/docs/Passes.html
+++ b/docs/Passes.html
@@ -126,6 +126,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
 <tr><td><a href="#adce">-adce</a></td><td>Aggressive Dead Code Elimination</td></tr>
 <tr><td><a href="#always-inline">-always-inline</a></td><td>Inliner for always_inline functions</td></tr>
 <tr><td><a href="#argpromotion">-argpromotion</a></td><td>Promote 'by reference' arguments to scalars</td></tr>
+<tr><td><a href="#bb-vectorize">-bb-vectorize</a></td><td>Combine instructions to form vector instructions within basic blocks</td></tr>
 <tr><td><a href="#block-placement">-block-placement</a></td><td>Profile Guided Basic Block Placement</td></tr>
 <tr><td><a href="#break-crit-edges">-break-crit-edges</a></td><td>Break critical edges in CFG</td></tr>
 <tr><td><a href="#codegenprepare">-codegenprepare</a></td><td>Optimize for code generation</td></tr>
@@ -815,6 +816,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !
   </p>
 </div>
 
+<!-------------------------------------------------------------------------- -->
+<h3>
+  <a name="bb-vectorize">-bb-vectorize: Basic-Block Vectorization</a>
+</h3>
+<div>
+  <p>This pass combines instructions inside basic blocks to form vector
+  instructions. It iterates over each basic block, attempting to pair
+  compatible instructions, repeating this process until no additional
+  pairs are selected for vectorization. When the outputs of some pair
+  of compatible instructions are used as inputs by some other pair of
+  compatible instructions, those pairs are part of a potential
+  vectorization chain. Instruction pairs are only fused into vector
+  instructions when they are part of a chain longer than some
+  threshold length. Moreover, the pass attempts to find the best
+  possible chain for each pair of compatible instructions. These
+  heuristics are intended to prevent vectorization in cases where
+  it would not yield a performance increase of the resulting code.
+  </p>
+</div>
+
 <!-------------------------------------------------------------------------- -->
 <h3>
   <a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a>
diff --git a/include/llvm-c/Initialization.h b/include/llvm-c/Initialization.h
index 3b59abbec0..cbe60df908 100644
--- a/include/llvm-c/Initialization.h
+++ b/include/llvm-c/Initialization.h
@@ -25,6 +25,7 @@ extern "C" {
 void LLVMInitializeCore(LLVMPassRegistryRef R);
 void LLVMInitializeTransformUtils(LLVMPassRegistryRef R);
 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
+void LLVMInitializeVectorization(LLVMPassRegistryRef R);
 void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
 void LLVMInitializeIPO(LLVMPassRegistryRef R);
 void LLVMInitializeInstrumentation(LLVMPassRegistryRef R);
diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h
new file mode 100644
index 0000000000..178465ac82
--- /dev/null
+++ b/include/llvm-c/Transforms/Vectorize.h
@@ -0,0 +1,37 @@
+/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\
+|*===----------- Vectorization Transformation Library C Interface ---------===*|
+|*                                                                            *|
+|*                     The LLVM Compiler Infrastructure                       *|
+|*                                                                            *|
+|* This file is distributed under the University of Illinois Open Source      *|
+|* License. See LICENSE.TXT for details.                                      *|
+|*                                                                            *|
+|*===----------------------------------------------------------------------===*|
+|*                                                                            *|
+|* This header declares the C interface to libLLVMVectorize.a, which          *|
+|* implements various vectorization transformations of the LLVM IR.           *|
+|*                                                                            *|
+|* Many exotic languages can interoperate with C code but have a harder time  *|
+|* with C++ due to name mangling. So in addition to C, this interface enables *|
+|* tools written in such languages.                                           *|
+|*                                                                            *|
+\*===----------------------------------------------------------------------===*/
+
+#ifndef LLVM_C_TRANSFORMS_VECTORIZE_H
+#define LLVM_C_TRANSFORMS_VECTORIZE_H
+
+#include "llvm-c/Core.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** See llvm::createBBVectorizePass function. */
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
+
+#ifdef __cplusplus
+}
+#endif /* defined(__cplusplus) */
+
+#endif
+
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index 6cc3f195c7..ac129c77d5 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&);
 /// ScalarOpts library.
 void initializeScalarOpts(PassRegistry&);
 
+/// initializeVectorization - Initialize all passes linked into the
+/// Vectorize library.
+void initializeVectorization(PassRegistry&);
+
 /// initializeInstCombine - Initialize all passes linked into the
 /// ScalarOpts library.
 void initializeInstCombine(PassRegistry&);
@@ -236,7 +240,7 @@ void initializeVirtRegMapPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
 void initializeUnpackMachineBundlesPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
-
+void initializeBBVectorizePass(PassRegistry&);
 }
 
 #endif
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 537fab757b..2258d45ce9 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -31,6 +31,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include <cstdlib>
 
@@ -151,6 +152,7 @@ namespace {
       (void) llvm::createCorrelatedValuePropagationPass();
       (void) llvm::createMemDepPrinter();
       (void) llvm::createInstructionSimplifierPass();
+      (void) llvm::createBBVectorizePass();
 
       (void)new llvm::IntervalPartition();
       (void)new llvm::FindUsedTypes();
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index b4ba184781..a1b4f5cd90 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -99,6 +99,7 @@ public:
   bool DisableSimplifyLibCalls;
   bool DisableUnitAtATime;
   bool DisableUnrollLoops;
+  bool Vectorize;
 
 private:
   /// ExtensionList - This is list of all of the extensions that are registered.
diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h
new file mode 100644
index 0000000000..dfc099ddb9
--- /dev/null
+++ b/include/llvm/Transforms/Vectorize.h
@@ -0,0 +1,30 @@
+//===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for accessor functions that expose passes
+// in the Vectorize transformations library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_H
+#define LLVM_TRANSFORMS_VECTORIZE_H
+
+namespace llvm {
+
+class BasicBlockPass;
+
+//===----------------------------------------------------------------------===//
+//
+// BBVectorize - A basic-block vectorization pass.
+//
+BasicBlockPass *createBBVectorizePass();
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt
index 10e0cc6b56..de1353e6c1 100644
--- a/lib/Transforms/CMakeLists.txt
+++ b/lib/Transforms/CMakeLists.txt
@@ -3,4 +3,5 @@ add_subdirectory(Instrumentation)
 add_subdirectory(InstCombine)
 add_subdirectory(Scalar)
 add_subdirectory(IPO)
+add_subdirectory(Vectorize)
 add_subdirectory(Hello)
diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt
index b358fabdfa..b18c9150f4 100644
--- a/lib/Transforms/IPO/LLVMBuild.txt
+++ b/lib/Transforms/IPO/LLVMBuild.txt
@@ -20,4 +20,4 @@ type = Library
 name = IPO
 parent = Transforms
 library_name = ipo
-required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils
+required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index afd25dc317..84084374b3 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -21,14 +21,20 @@
 #include "llvm/DefaultPasses.h"
 #include "llvm/PassManager.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
+static cl::opt<bool>
+RunVectorization("vectorize", cl::desc("Run vectorization passes"));
+
 PassManagerBuilder::PassManagerBuilder() {
     OptLevel = 2;
     SizeLevel = 0;
@@ -37,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() {
     DisableSimplifyLibCalls = false;
     DisableUnitAtATime = false;
     DisableUnrollLoops = false;
+    Vectorize = RunVectorization;
 }
 
 PassManagerBuilder::~PassManagerBuilder() {
@@ -172,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
 
   addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
 
+  if (Vectorize) {
+    MPM.add(createBBVectorizePass());
+    MPM.add(createInstructionCombiningPass());
+    if (OptLevel > 1)
+      MPM.add(createGVNPass());                 // Remove redundancies
+  }
+
   MPM.add(createAggressiveDCEPass());         // Delete dead instructions
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   MPM.add(createInstructionCombiningPass());  // Clean up after everything.
diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt
index b2ef49a4c6..f7bca064c7 100644
--- a/lib/Transforms/LLVMBuild.txt
+++ b/lib/Transforms/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = IPO InstCombine Instrumentation Scalar Utils
+subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize
 
 [component_0]
 type = Group
diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile
index e527be25de..8b1df92fa2 100644
--- a/lib/Transforms/Makefile
+++ b/lib/Transforms/Makefile
@@ -8,7 +8,7 @@
 ##===----------------------------------------------------------------------===##
 
 LEVEL = ../..
-PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello
+PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello
 
 include $(LEVEL)/Makefile.config
 
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
new file mode 100644
index 0000000000..9c2c8dd51b
--- /dev/null
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -0,0 +1,1796 @@
+//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+
+#define BBV_NAME "bb-vectorize"
+#define DEBUG_TYPE BBV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+static cl::opt<unsigned>
+ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
+  cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<unsigned>
+SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
+  cl::desc("The maximum search distance for instruction pairs"));
+
+static cl::opt<bool>
+SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
+  cl::desc("Replicating one element to a pair breaks the chain"));
+
+static cl::opt<unsigned>
+VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
+  cl::desc("The size of the native vector registers"));
+
+static cl::opt<unsigned>
+MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
+  cl::desc("The maximum number of pairing iterations"));
+
+static cl::opt<unsigned>
+MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
+  cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
+                       " a full cycle check"));
+
+static cl::opt<bool>
+NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize integer values"));
+
+static cl::opt<bool>
+NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point values"));
+
+static cl::opt<bool>
+NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize casting (conversion) operations"));
+
+static cl::opt<bool>
+NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+static cl::opt<bool>
+NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+  cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
+  cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
+  cl::desc("Use a fast instruction dependency analysis"));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("bb-vectorize-debug-pair-selection",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("bb-vectorize-debug-cycle-check",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, output information on the"
+           " cycle-checking process"));
+#endif
+
+STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
+
+namespace {
+  struct BBVectorize : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+    BBVectorize() : BasicBlockPass(ID) {
+      initializeBBVectorizePass(*PassRegistry::getPassRegistry());
+    }
+
+    typedef std::pair<Value *, Value *> ValuePair;
+    typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+    typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+    typedef std::pair<std::multimap<Value *, Value *>::iterator,
+              std::multimap<Value *, Value *>::iterator> VPIteratorPair;
+    typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
+              std::multimap<ValuePair, ValuePair>::iterator>
+                VPPIteratorPair;
+
+    AliasAnalysis *AA;
+    ScalarEvolution *SE;
+    TargetData *TD;
+
+    // FIXME: const correct?
+
+    bool vectorizePairs(BasicBlock &BB);
+
+    void getCandidatePairs(BasicBlock &BB,
+                       std::multimap<Value *, Value *> &CandidatePairs,
+                       std::vector<Value *> &PairableInsts);
+
+    void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
+                       std::vector<Value *> &PairableInsts,
+                       std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+
+    void buildDepMap(BasicBlock &BB,
+                       std::multimap<Value *, Value *> &CandidatePairs,
+                       std::vector<Value *> &PairableInsts,
+                       DenseSet<ValuePair> &PairableInstUsers);
+
+    void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+                        std::vector<Value *> &PairableInsts,
+                        std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                        DenseSet<ValuePair> &PairableInstUsers,
+                        DenseMap<Value *, Value *>& ChosenPairs);
+
+    void fuseChosenPairs(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *>& ChosenPairs);
+
+    bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+    bool areInstsCompatible(Instruction *I, Instruction *J,
+                       bool IsSimpleLoadStore);
+
+    bool trackUsesOfI(DenseSet<Value *> &Users,
+                      AliasSetTracker &WriteSet, Instruction *I,
+                      Instruction *J, bool UpdateUsers = true,
+                      std::multimap<Value *, Value *> *LoadMoveSet = 0);
+  
+    void computePairsConnectedTo(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      ValuePair P);
+
+    bool pairsConflict(ValuePair P, ValuePair Q,
+                 DenseSet<ValuePair> &PairableInstUsers,
+                 std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
+
+    bool pairWillFormCycle(ValuePair P,
+                       std::multimap<ValuePair, ValuePair> &PairableInstUsers,
+                       DenseSet<ValuePair> &CurrentPairs);
+
+    void pruneTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseMap<ValuePair, size_t> &Tree,
+                      DenseSet<ValuePair> &PrunedTree, ValuePair J,
+                      bool UseCycleCheck);
+
+    void buildInitialTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseMap<ValuePair, size_t> &Tree, ValuePair J);
+
+    void findBestTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+                      size_t &BestEffSize, VPIteratorPair ChoiceRange,
+                      bool UseCycleCheck);
+
+    Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool &FlipMemInputs);
+
+    void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+                     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+                     unsigned IdxOffset, std::vector<Constant*> &Mask);
+
+    Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+                     Instruction *J);
+
+    Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool FlipMemInputs);
+
+    void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     bool &FlipMemInputs);
+
+    void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, Instruction *K,
+                     Instruction *&InsertionPt, Instruction *&K1,
+                     Instruction *&K2, bool &FlipMemInputs);
+
+    void collectPairLoadMoveSet(BasicBlock &BB,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I);
+
+    void collectLoadMoveSet(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet);
+
+    bool canMoveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I, Instruction *J);
+
+    void moveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *&InsertionPt,
+                     Instruction *I, Instruction *J);
+
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      AA = &getAnalysis<AliasAnalysis>();
+      SE = &getAnalysis<ScalarEvolution>();
+      TD = getAnalysisIfAvailable<TargetData>();
+
+      bool changed = false;
+      // Iterate a sufficient number of times to merge types of size 1 bit,
+      // then 2 bits, then 4, etc. up to half of the target vector width of the
+      // target vector register.
+      for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
+           v *= 2, ++n) {
+        DEBUG(dbgs() << "BBV: fusing loop #" << n << 
+              " for " << BB.getName() << " in " <<
+              BB.getParent()->getName() << "...\n");
+        if (vectorizePairs(BB))
+          changed = true;
+        else
+          break;
+      }
+
+      DEBUG(dbgs() << "BBV: done!\n");
+      return changed;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      BasicBlockPass::getAnalysisUsage(AU);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<ScalarEvolution>();
+    }
+
+    // This returns the vector type that holds a pair of the provided type.
+    // If the provided type is already a vector, then its length is doubled.
+    static inline VectorType *getVecTypeForPair(Type *ElemTy) {
+      if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
+        unsigned numElem = VTy->getNumElements();
+        return VectorType::get(ElemTy->getScalarType(), numElem*2);
+      } else {
+        return VectorType::get(ElemTy, 2);
+      }
+    }
+
+    // Returns the weight associated with the provided value. A chain of
+    // candidate pairs has a length given by the sum of the weights of its
+    // members (one weight per pair; the weight of each member of the pair
+    // is assumed to be the same). This length is then compared to the
+    // chain-length threshold to determine if a given chain is significant
+    // enough to be vectorized. The length is also used in comparing
+    // candidate chains where longer chains are considered to be better.
+    // Note: when this function returns 0, the resulting instructions are
+    // not actually fused.
+    static inline size_t getDepthFactor(Value *V) {
+      // InsertElement and ExtractElement have a depth factor of zero. This is
+      // for two reasons: First, they cannot be usefully fused. Second, because
+      // the pass generates a lot of these, they can confuse the simple metric
+      // used to compare the trees in the next iteration. Thus, giving them a
+      // weight of zero allows the pass to essentially ignore them in
+      // subsequent iterations when looking for vectorization opportunities
+      // while still tracking dependency chains that flow through those
+      // instructions.
+      if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
+        return 0;
+
+      return 1;
+    }
+
+    // This determines the relative offset of two loads or stores, returning
+    // true if the offset could be determined to be some constant value.
+    // For example, if OffsetInElmts == 1, then J accesses the memory directly
+    // after I; if OffsetInElmts == -1 then I accesses the memory
+    // directly after J. This function assumes that both instructions
+    // have the same type.
+    bool getPairPtrInfo(Instruction *I, Instruction *J,
+        Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+        int64_t &OffsetInElmts) {
+      OffsetInElmts = 0;
+      if (isa<LoadInst>(I)) {
+        IPtr = cast<LoadInst>(I)->getPointerOperand();
+        JPtr = cast<LoadInst>(J)->getPointerOperand();
+        IAlignment = cast<LoadInst>(I)->getAlignment();
+        JAlignment = cast<LoadInst>(J)->getAlignment();
+      } else {
+        IPtr = cast<StoreInst>(I)->getPointerOperand();
+        JPtr = cast<StoreInst>(J)->getPointerOperand();
+        IAlignment = cast<StoreInst>(I)->getAlignment();
+        JAlignment = cast<StoreInst>(J)->getAlignment();
+      }
+
+      const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+      const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+      // If this is a trivial offset, then we'll get something like
+      // 1*sizeof(type). With target data, which we need anyway, this will get
+      // constant folded into a number.
+      const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+      if (const SCEVConstant *ConstOffSCEV =
+            dyn_cast<SCEVConstant>(OffsetSCEV)) {
+        ConstantInt *IntOff = ConstOffSCEV->getValue();
+        int64_t Offset = IntOff->getSExtValue();
+
+        Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+        int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+
+        assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
+
+        OffsetInElmts = Offset/VTyTSS;
+        return (abs64(Offset) % VTyTSS) == 0;
+      }
+
+      return false;
+    }
+
+    // Returns true if the provided CallInst represents an intrinsic that can
+    // be vectorized.
+    bool isVectorizableIntrinsic(CallInst* I) {
+      Function *F = I->getCalledFunction();
+      if (!F) return false;
+
+      unsigned IID = F->getIntrinsicID();
+      if (!IID) return false;
+
+      switch(IID) {
+      default:
+        return false;
+      case Intrinsic::sqrt:
+      case Intrinsic::powi:
+      case Intrinsic::sin:
+      case Intrinsic::cos:
+      case Intrinsic::log:
+      case Intrinsic::log2:
+      case Intrinsic::log10:
+      case Intrinsic::exp:
+      case Intrinsic::exp2:
+      case Intrinsic::pow:
+        return !NoMath;
+      case Intrinsic::fma:
+        return !NoFMA;
+      }
+    }
+
+    // Returns true if J is the second element in some pair referenced by
+    // some multimap pair iterator pair.
+    template <typename V>
+    bool isSecondInIteratorPair(V J, std::pair<
+           typename std::multimap<V, V>::iterator,
+           typename std::multimap<V, V>::iterator> PairRange) {
+      for (typename std::multimap<V, V>::iterator K = PairRange.first;
+           K != PairRange.second; ++K)
+        if (K->second == J) return true;
+
+      return false;
+    }
+  };
+
+  // This function implements one vectorization iteration on the provided
+  // basic block. It returns true if the block is changed.
+  bool BBVectorize::vectorizePairs(BasicBlock &BB) {
+    std::vector<Value *> PairableInsts;
+    std::multimap<Value *, Value *> CandidatePairs;
+    getCandidatePairs(BB, CandidatePairs, PairableInsts);
+    if (PairableInsts.size() == 0) return false;
+
+    // Now we have a map of all of the pairable instructions and we need to
+    // select the best possible pairing. A good pairing is one such that the
+    // users of the pair are also paired. This defines a (directed) forest
+    // over the pairs such that two pairs are connected iff the second pair
+    // uses the first.
+
+    // Note that it only matters that both members of the second pair use some
+    // element of the first pair (to allow for splatting).
+
+    std::multimap<ValuePair, ValuePair> ConnectedPairs;
+    computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+    if (ConnectedPairs.size() == 0) return false;
+
+    // Build the pairable-instruction dependency map
+    DenseSet<ValuePair> PairableInstUsers;
+    buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+    // There is now a graph of the connected pairs. For each variable, pick the
+    // pairing with the largest tree meeting the depth requirement on at least
+    // one branch. Then select all pairings that are part of that tree and
+    // remove them from the list of available pairings and pairable variables.
+
+    DenseMap<Value *, Value *> ChosenPairs;
+    choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+      PairableInstUsers, ChosenPairs);
+
+    if (ChosenPairs.size() == 0) return false;
+    NumFusedOps += ChosenPairs.size();
+
+    // A set of pairs has now been selected. It is now necessary to replace the
+    // paired instructions with vector instructions. For this procedure each
+    // operand much be replaced with a vector operand. This vector is formed
+    // by using build_vector on the old operands. The replaced values are then
+    // replaced with a vector_extract on the result.  Subsequent optimization
+    // passes should coalesce the build/extract combinations.
+
+    fuseChosenPairs(BB, PairableInsts, ChosenPairs);
+
+    return true;
+  }
+
+  // This function returns true if the provided instruction is capable of being
+  // fused into a vector instruction. This determination is based only on the
+  // type and other attributes of the instruction.
+  bool BBVectorize::isInstVectorizable(Instruction *I,
+                                         bool &IsSimpleLoadStore) {
+    IsSimpleLoadStore = false;
+
+    if (CallInst *C = dyn_cast<CallInst>(I)) {
+      if (!isVectorizableIntrinsic(C))
+        return false;
+    } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+      // Vectorize simple loads if possbile:
+      IsSimpleLoadStore = L->isSimple();
+      if (!IsSimpleLoadStore || NoMemOps)
+        return false;
+    } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+      // Vectorize simple stores if possbile:
+      IsSimpleLoadStore = S->isSimple();
+      if (!IsSimpleLoadStore || NoMemOps)
+        return false;
+    } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+      // We can vectorize casts, but not casts of pointer types, etc.
+      if (NoCasts)
+        return false;
+
+      Type *SrcTy = C->getSrcTy();
+      if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
+        return false;
+
+      Type *DestTy = C->getDestTy();
+      if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
+        return false;
+    } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
+        isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
+      return false;
+    }
+
+    // We can't vectorize memory operations without target data
+    if (TD == 0 && IsSimpleLoadStore)
+      return false;
+
+    Type *T1, *T2;
+    if (isa<StoreInst>(I)) {
+      // For stores, it is the value type, not the pointer type that matters
+      // because the value is what will come from a vector register.
+
+      Value *IVal = cast<StoreInst>(I)->getValueOperand();
+      T1 = IVal->getType();
+    } else {
+      T1 = I->getType();
+    }
+
+    if (I->isCast())
+      T2 = cast<CastInst>(I)->getSrcTy();
+    else
+      T2 = T1;
+
+    // Not every type can be vectorized...
+    if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
+        !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
+      return false;
+
+    if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+      return false;
+
+    if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
+      return false;
+
+    if (T1->getPrimitiveSizeInBits() > VectorBits/2 ||
+        T2->getPrimitiveSizeInBits() > VectorBits/2)
+      return false;
+
+    return true;
+  }
+
+  // This function returns true if the two provided instructions are compatible
+  // (meaning that they can be fused into a vector instruction). This assumes
+  // that I has already been determined to be vectorizable and that J is not
+  // in the use tree of I.
+  bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+                       bool IsSimpleLoadStore) {
+    DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
+                     " <-> " << *J << "\n");
+
+    // Loads and stores can be merged if they have different alignments,
+    // but are otherwise the same.
+    LoadInst *LI, *LJ;
+    StoreInst *SI, *SJ;
+    if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) {
+      if (I->getType() != J->getType())
+        return false;
+
+      if (LI->getPointerOperand()->getType() !=
+            LJ->getPointerOperand()->getType() ||
+          LI->isVolatile() != LJ->isVolatile() ||
+          LI->getOrdering() != LJ->getOrdering() ||
+          LI->getSynchScope() != LJ->getSynchScope())
+        return false; 
+    } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) {
+      if (SI->getValueOperand()->getType() !=
+            SJ->getValueOperand()->getType() ||
+          SI->getPointerOperand()->getType() !=
+            SJ->getPointerOperand()->getType() ||
+          SI->isVolatile() != SJ->isVolatile() ||
+          SI->getOrdering() != SJ->getOrdering() ||
+          SI->getSynchScope() != SJ->getSynchScope())
+        return false;
+    } else if (!J->isSameOperationAs(I)) {
+      return false;
+    }
+    // FIXME: handle addsub-type operations!
+
+    if (IsSimpleLoadStore) {
+      Value *IPtr, *JPtr;
+      unsigned IAlignment, JAlignment;
+      int64_t OffsetInElmts = 0;
+      if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+            OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+        if (AlignedOnly) {
+          Type *aType = isa<StoreInst>(I) ?
+            cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+          // An aligned load or store is possible only if the instruction
+          // with the lower offset has an alignment suitable for the
+          // vector type.
+  
+          unsigned BottomAlignment = IAlignment;
+          if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+  
+          Type *VType = getVecTypeForPair(aType);
+          unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+          if (BottomAlignment < VecAlignment)
+            return false;
+        }
+      } else {
+        return false;
+      }
+    } else if (isa<ShuffleVectorInst>(I)) {
+      // Only merge two shuffles if they're both constant
+      return isa<Constant>(I->getOperand(2)) &&
+             isa<Constant>(J->getOperand(2));
+      // FIXME: We may want to vectorize non-constant shuffles also.
+    }
+
+    return true;
+  }
+
+  // Figure out whether or not J uses I and update the users and write-set
+  // structures associated with I. Specifically, Users represents the set of
+  // instructions that depend on I. WriteSet represents the set
+  // of memory locations that are dependent on I. If UpdateUsers is true,
+  // and J uses I, then Users is updated to contain J and WriteSet is updated
+  // to contain any memory locations to which J writes. The function returns
+  // true if J uses I. By default, alias analysis is used to determine
+  // whether J reads from memory that overlaps with a location in WriteSet.
+  // If LoadMoveSet is not null, then it is a previously-computed multimap
+  // where the key is the memory-based user instruction and the value is
+  // the instruction to be compared with I. So, if LoadMoveSet is provided,
+  // then the alias analysis is not used. This is necessary because this
+  // function is called during the process of moving instructions during
+  // vectorization and the results of the alias analysis are not stable during
+  // that process.
+  bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+                       AliasSetTracker &WriteSet, Instruction *I,
+                       Instruction *J, bool UpdateUsers,
+                       std::multimap<Value *, Value *> *LoadMoveSet) {
+    bool UsesI = false;
+
+    // This instruction may already be marked as a user due, for example, to
+    // being a member of a selected pair.
+    if (Users.count(J))
+      UsesI = true;
+
+    if (!UsesI)
+      for (User::op_iterator JU = J->op_begin(), e = J->op_end();
+           JU != e; ++JU) {
+        Value *V = *JU;
+        if (I == V || Users.count(V)) {
+          UsesI = true;
+          break;
+        }
+      }
+    if (!UsesI && J->mayReadFromMemory()) {
+      if (LoadMoveSet) {
+        VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
+        UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
+      } else {
+        for (AliasSetTracker::iterator W = WriteSet.begin(),
+             WE = WriteSet.end(); W != WE; ++W) {
+          for (AliasSet::iterator A = W->begin(), AE = W->end();
+               A != AE; ++A) {
+            AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
+                                           A->getTBAAInfo());
+            if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
+              UsesI = true;
+              break;
+            }
+          }
+          if (UsesI) break;
+        }
+      }
+    }
+
+    if (UsesI && UpdateUsers) {
+      if (J->mayWriteToMemory()) WriteSet.add(J);
+      Users.insert(J);
+    }
+
+    return UsesI;
+  }
+
+  // This function iterates over all instruction pairs in the provided
+  // basic block and collects all candidate pairs for vectorization.
+  void BBVectorize::getCandidatePairs(BasicBlock &BB,
+                       std::multimap<Value *, Value *> &CandidatePairs,
+                       std::vector<Value *> &PairableInsts) {
+    BasicBlock::iterator E = BB.end();
+    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+      bool IsSimpleLoadStore;
+      if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+
+      // Look for an instruction with which to pair instruction *I...
+      DenseSet<Value *> Users;
+      AliasSetTracker WriteSet(*AA);
+      BasicBlock::iterator J = I; ++J;
+      for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
+        // Determine if J uses I, if so, exit the loop.
+        bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
+        if (FastDep) {
+          // Note: For this heuristic to be effective, independent operations
+          // must tend to be intermixed. This is likely to be true from some
+          // kinds of grouped loop unrolling (but not the generic LLVM pass),
+          // but otherwise may require some kind of reordering pass.
+
+          // When using fast dependency analysis,
+          // stop searching after first use:
+          if (UsesI) break;
+        } else {
+          if (UsesI) continue;
+        }
+
+        // J does not use I, and comes before the first use of I, so it can be
+        // merged with I if the instructions are compatible.
+        if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
+
+        // J is a candidate for merging with I.
+        if (!PairableInsts.size() ||
+             PairableInsts[PairableInsts.size()-1] != I) {
+          PairableInsts.push_back(I);
+        }
+        CandidatePairs.insert(ValuePair(I, J));
+        DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
+                     << *I << " <-> " << *J << "\n");
+      }
+    }
+
+    DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
+           << " instructions with candidate pairs\n");
+  }
+
+  // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
+  // it looks for pairs such that both members have an input which is an
+  // output of PI or PJ.
+  void BBVectorize::computePairsConnectedTo(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      ValuePair P) {
+    // For each possible pairing for this variable, look at the uses of
+    // the first value...
+    for (Value::use_iterator I = P.first->use_begin(),
+         E = P.first->use_end(); I != E; ++I) {
+      VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
+
+      // For each use of the first variable, look for uses of the second
+      // variable...
+      for (Value::use_iterator J = P.second->use_begin(),
+           E2 = P.second->use_end(); J != E2; ++J) {
+        VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
+
+        // Look for <I, J>:
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+
+        // Look for <J, I>:
+        if (isSecondInIteratorPair<Value*>(*I, JPairRange))
+          ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+      }
+
+      if (SplatBreaksChain) continue;
+      // Look for cases where just the first value in the pair is used by
+      // both members of another pair (splatting).
+      for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+      }
+    }
+
+    if (SplatBreaksChain) return;
+    // Look for cases where just the second value in the pair is used by
+    // both members of another pair (splatting).
+    for (Value::use_iterator I = P.second->use_begin(),
+         E = P.second->use_end(); I != E; ++I) {
+      VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
+
+      for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+      }
+    }
+  }
+
+  // This function figures out which pairs are connected.  Two pairs are
+  // connected if some output of the first pair forms an input to both members
+  // of the second pair.
+  void BBVectorize::computeConnectedPairs(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+
+    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+         PE = PairableInsts.end(); PI != PE; ++PI) {
+      VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
+
+      for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
+           P != choiceRange.second; ++P)
+        computePairsConnectedTo(CandidatePairs, PairableInsts,
+                                ConnectedPairs, *P);
+    }
+
+    DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
+                 << " pair connections.\n");
+  }
+
+  // This function builds a set of use tuples such that <A, B> is in the set
+  // if B is in the use tree of A. If B is in the use tree of A, then B
+  // depends on the output of A.
+  void BBVectorize::buildDepMap(
+                      BasicBlock &BB,
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      DenseSet<ValuePair> &PairableInstUsers) {
+    DenseSet<Value *> IsInPair;
+    for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
+         E = CandidatePairs.end(); C != E; ++C) {
+      IsInPair.insert(C->first);
+      IsInPair.insert(C->second);
+    }
+
+    // Iterate through the basic block, recording all Users of each
+    // pairable instruction.
+
+    BasicBlock::iterator E = BB.end();
+    for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+      if (IsInPair.find(I) == IsInPair.end()) continue;
+
+      DenseSet<Value *> Users;
+      AliasSetTracker WriteSet(*AA);
+      for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
+        (void) trackUsesOfI(Users, WriteSet, I, J);
+
+      for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
+           U != E; ++U)
+        PairableInstUsers.insert(ValuePair(I, *U));
+    }
+  }
+
+  // Returns true if an input to pair P is an output of pair Q and also an
+  // input of pair Q is an output of pair P. If this is the case, then these
+  // two pairs cannot be simultaneously fused.
+  bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
+                     DenseSet<ValuePair> &PairableInstUsers,
+                     std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+    // Two pairs are in conflict if they are mutual Users of eachother.
+    bool QUsesP = PairableInstUsers.count(ValuePair(P.first,  Q.first))  ||
+                  PairableInstUsers.count(ValuePair(P.first,  Q.second)) ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.first))  ||
+                  PairableInstUsers.count(ValuePair(P.second, Q.second));
+    bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first,  P.first))  ||
+                  PairableInstUsers.count(ValuePair(Q.first,  P.second)) ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.first))  ||
+                  PairableInstUsers.count(ValuePair(Q.second, P.second));
+    if (PairableInstUserMap) {
+      // FIXME: The expensive part of the cycle check is not so much the cycle
+      // check itself but this edge insertion procedure. This needs some
+      // profiling and probably a different data structure (same is true of
+      // most uses of std::multimap).
+      if (PUsesQ) {
+        VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
+        if (!isSecondInIteratorPair(P, QPairRange))
+          PairableInstUserMap->insert(VPPair(Q, P));
+      }
+      if (QUsesP) {
+        VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
+        if (!isSecondInIteratorPair(Q, PPairRange))
+          PairableInstUserMap->insert(VPPair(P, Q));
+      }
+    }
+
+    return (QUsesP && PUsesQ);
+  }
+
+  // This function walks the use graph of current pairs to see if, starting
+  // from P, the walk returns to P.
+  bool BBVectorize::pairWillFormCycle(ValuePair P,
+                       std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                       DenseSet<ValuePair> &CurrentPairs) {
+    DEBUG(if (DebugCycleCheck)
+            dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
+                   << *P.second << "\n");
+    // A lookup table of visisted pairs is kept because the PairableInstUserMap
+    // contains non-direct associations.
+    DenseSet<ValuePair> Visited;
+    std::vector<ValuePair> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(P);
+    while (!Q.empty()) {
+      ValuePair QTop = Q.back();
+
+      Visited.insert(QTop);
+      Q.pop_back();
+
+      DEBUG(if (DebugCycleCheck)
+              dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
+                     << *QTop.second << "\n");
+      VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
+      for (std::multimap<ValuePair, ValuePair>::iterator C = QPairRange.first;
+           C != QPairRange.second; ++C) {
+        if (C->second == P) {
+          DEBUG(dbgs()
+                 << "BBV: rejected to prevent non-trivial cycle formation: "
+                 << *C->first.first << " <-> " << *C->first.second << "\n");
+          return true;
+        }
+
+        if (CurrentPairs.count(C->second) > 0 &&
+            Visited.count(C->second) == 0)
+          Q.push_back(C->second);
+      }
+    }
+
+    return false;
+  }
+
+  // This function builds the initial tree of connected pairs with the
+  // pair J at the root.
+  void BBVectorize::buildInitialTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseMap<ValuePair, size_t> &Tree, ValuePair J) {
+    // Each of these pairs is viewed as the root node of a Tree. The Tree
+    // is then walked (depth-first). As this happens, we keep track of
+    // the pairs that compose the Tree and the maximum depth of the Tree.
+    std::vector<ValuePairWithDepth> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+    while (!Q.empty()) {
+      ValuePairWithDepth QTop = Q.back();
+
+      // Push each child onto the queue:
+      bool MoreChildren = false;
+      size_t MaxChildDepth = QTop.second;
+      VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
+      for (std::map<ValuePair, ValuePair>::iterator k = qtRange.first;
+           k != qtRange.second; ++k) {
+        // Make sure that this child pair is still a candidate:
+        bool IsStillCand = false;
+        VPIteratorPair checkRange =
+          CandidatePairs.equal_range(k->second.first);
+        for (std::multimap<Value *, Value *>::iterator m = checkRange.first;
+             m != checkRange.second; ++m) {
+          if (m->second == k->second.second) {
+            IsStillCand = true;
+            break;
+          }
+        }
+
+        if (IsStillCand) {
+          DenseMap<ValuePair, size_t>::iterator C = Tree.find(k->second);
+          if (C == Tree.end()) {
+            size_t d = getDepthFactor(k->second.first);
+            Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
+            MoreChildren = true;
+          } else {
+            MaxChildDepth = std::max(MaxChildDepth, C->second);
+          }
+        }
+      }
+
+      if (!MoreChildren) {
+        // Record the current pair as part of the Tree:
+        Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
+        Q.pop_back();
+      }
+    }
+  }
+
+  // Given some initial tree, prune it by removing conflicting pairs (pairs
+  // that cannot be simultaneously chosen for vectorization).
+  void BBVectorize::pruneTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseMap<ValuePair, size_t> &Tree,
+                      DenseSet<ValuePair> &PrunedTree, ValuePair J,
+                      bool UseCycleCheck) {
+    std::vector<ValuePairWithDepth> Q;
+    // General depth-first post-order traversal:
+    Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
+    while (!Q.empty()) {
+      ValuePairWithDepth QTop = Q.back();
+      PrunedTree.insert(QTop.first);
+      Q.pop_back();
+
+      // Visit each child, pruning as necessary...
+      DenseMap<ValuePair, size_t> BestChilden;
+      VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
+      for (std::map<ValuePair, ValuePair>::iterator K = QTopRange.first;
+           K != QTopRange.second; ++K) {
+        DenseMap<ValuePair, size_t>::iterator C = Tree.find(K->second);
+        if (C == Tree.end()) continue;
+
+        // This child is in the Tree, now we need to make sure it is the
+        // best of any conflicting children. There could be multiple
+        // conflicting children, so first, determine if we're keeping
+        // this child, then delete conflicting children as necessary.
+
+        // It is also necessary to guard against pairing-induced
+        // dependencies. Consider instructions a .. x .. y .. b
+        // such that (a,b) are to be fused and (x,y) are to be fused
+        // but a is an input to x and b is an output from y. This
+        // means that y cannot be moved after b but x must be moved
+        // after b for (a,b) to be fused. In other words, after
+        // fusing (a,b) we have y .. a/b .. x where y is an input
+        // to a/b and x is an output to a/b: x and y can no longer
+        // be legally fused. To prevent this condition, we must
+        // make sure that a child pair added to the Tree is not
+        // both an input and output of an already-selected pair.
+
+        // Pairing-induced dependencies can also form from more complicated
+        // cycles. The pair vs. pair conflicts are easy to check, and so
+        // that is done explicitly for "fast rejection", and because for
+        // child vs. child conflicts, we may prefer to keep the current
+        // pair in preference to the already-selected child.
+        DenseSet<ValuePair> CurrentPairs;
+
+        bool CanAdd = true;
+        for (DenseMap<ValuePair, size_t>::iterator C2
+              = BestChilden.begin(), E2 = BestChilden.end();
+             C2 != E2; ++C2) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0)) {
+            if (C2->second >= C->second) {
+              CanAdd = false;
+              break;
+            }
+
+            CurrentPairs.insert(C2->first);
+          }
+        }
+        if (!CanAdd) continue;
+
+        // Even worse, this child could conflict with another node already
+        // selected for the Tree. If that is the case, ignore this child.
+        for (DenseSet<ValuePair>::iterator T = PrunedTree.begin(),
+             E2 = PrunedTree.end(); T != E2; ++T) {
+          if (T->first == C->first.first ||
+              T->first == C->first.second ||
+              T->second == C->first.first ||
+              T->second == C->first.second ||
+              pairsConflict(*T, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(*T);
+        }
+        if (!CanAdd) continue;
+
+        // And check the queue too...
+        for (std::vector<ValuePairWithDepth>::iterator C2 = Q.begin(),
+             E2 = Q.end(); C2 != E2; ++C2) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(C2->first);
+        }
+        if (!CanAdd) continue;
+
+        // Last but not least, check for a conflict with any of the
+        // already-chosen pairs.
+        for (DenseMap<Value *, Value *>::iterator C2 =
+              ChosenPairs.begin(), E2 = ChosenPairs.end();
+             C2 != E2; ++C2) {
+          if (pairsConflict(*C2, C->first, PairableInstUsers,
+                            UseCycleCheck ? &PairableInstUserMap : 0)) {
+            CanAdd = false;
+            break;
+          }
+
+          CurrentPairs.insert(*C2);
+        }
+        if (!CanAdd) continue;
+
+	// To check for non-trivial cycles formed by the addition of the
+	// current pair we've formed a list of all relevant pairs, now use a
+	// graph walk to check for a cycle. We start from the current pair and
+	// walk the use tree to see if we again reach the current pair. If we
+	// do, then the current pair is rejected.
+
+        // FIXME: It may be more efficient to use a topological-ordering
+        // algorithm to improve the cycle check. This should be investigated.
+        if (UseCycleCheck &&
+            pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
+          continue;
+
+        // This child can be added, but we may have chosen it in preference
+        // to an already-selected child. Check for this here, and if a
+        // conflict is found, then remove the previously-selected child
+        // before adding this one in its place.
+        for (DenseMap<ValuePair, size_t>::iterator C2
+              = BestChilden.begin(); C2 != BestChilden.end();) {
+          if (C2->first.first == C->first.first ||
+              C2->first.first == C->first.second ||
+              C2->first.second == C->first.first ||
+              C2->first.second == C->first.second ||
+              pairsConflict(C2->first, C->first, PairableInstUsers))
+            BestChilden.erase(C2++);
+          else
+            ++C2;
+        }
+
+        BestChilden.insert(ValuePairWithDepth(C->first, C->second));
+      }
+
+      for (DenseMap<ValuePair, size_t>::iterator C
+            = BestChilden.begin(), E2 = BestChilden.end();
+           C != E2; ++C) {
+        size_t DepthF = getDepthFactor(C->first.first);
+        Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
+      }
+    }
+  }
+
+  // This function finds the best tree of mututally-compatible connected
+  // pairs, given the choice of root pairs as an iterator range.
+  void BBVectorize::findBestTreeFor(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+                      DenseMap<Value *, Value *> &ChosenPairs,
+                      DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+                      size_t &BestEffSize, VPIteratorPair ChoiceRange,
+                      bool UseCycleCheck) {
+    for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first;
+         J != ChoiceRange.second; ++J) {
+
+      // Before going any further, make sure that this pair does not
+      // conflict with any already-selected pairs (see comment below
+      // near the Tree pruning for more details).
+      DenseSet<ValuePair> ChosenPairSet;
+      bool DoesConflict = false;
+      for (DenseMap<Value *, Value *>::iterator C = ChosenPairs.begin(),
+           E = ChosenPairs.end(); C != E; ++C) {
+        if (pairsConflict(*C, *J, PairableInstUsers,
+                          UseCycleCheck ? &PairableInstUserMap : 0)) {
+          DoesConflict = true;
+          break;
+        }
+
+        ChosenPairSet.insert(*C);
+      }
+      if (DoesConflict) continue;
+
+      if (UseCycleCheck &&
+          pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
+        continue;
+
+      DenseMap<ValuePair, size_t> Tree;
+      buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                          PairableInstUsers, ChosenPairs, Tree, *J);
+
+      // Because we'll keep the child with the largest depth, the largest
+      // depth is still the same in the unpruned Tree.
+      size_t MaxDepth = Tree.lookup(*J);
+
+      DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
+                   << *J->first << " <-> " << *J->second << "} of depth " <<
+                   MaxDepth << " and size " << Tree.size() << "\n");
+
+      // At this point the Tree has been constructed, but, may contain
+      // contradictory children (meaning that different children of
+      // some tree node may be attempting to fuse the same instruction).
+      // So now we walk the tree again, in the case of a conflict,
+      // keep only the child with the largest depth. To break a tie,
+      // favor the first child.
+
+      DenseSet<ValuePair> PrunedTree;
+      pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                   PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
+                   PrunedTree, *J, UseCycleCheck);
+
+      size_t EffSize = 0;
+      for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
+           E = PrunedTree.end(); S != E; ++S)
+        EffSize += getDepthFactor(S->first);
+
+      DEBUG(if (DebugPairSelection)
+             dbgs() << "BBV: found pruned Tree for pair {"
+             << *J->first << " <-> " << *J->second << "} of depth " <<
+             MaxDepth << " and size " << PrunedTree.size() <<
+            " (effective size: " << EffSize << ")\n");
+      if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
+        BestMaxDepth = MaxDepth;
+        BestEffSize = EffSize;
+        BestTree = PrunedTree;
+      }
+    }
+  }
+
+  // Given the list of candidate pairs, this function selects those
+  // that will be fused into vector instructions.
+  void BBVectorize::choosePairs(
+                      std::multimap<Value *, Value *> &CandidatePairs,
+                      std::vector<Value *> &PairableInsts,
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseSet<ValuePair> &PairableInstUsers,
+                      DenseMap<Value *, Value *>& ChosenPairs) {
+    bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck;
+    std::multimap<ValuePair, ValuePair> PairableInstUserMap;
+    for (std::vector<Value *>::iterator I = PairableInsts.begin(),
+         E = PairableInsts.end(); I != E; ++I) {
+      // The number of possible pairings for this variable:
+      size_t NumChoices = CandidatePairs.count(*I);
+      if (!NumChoices) continue;
+
+      VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
+
+      // The best pair to choose and its tree:
+      size_t BestMaxDepth = 0, BestEffSize = 0;
+      DenseSet<ValuePair> BestTree;
+      findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
+                      PairableInstUsers, PairableInstUserMap, ChosenPairs,
+                      BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
+                      UseCycleCheck);
+
+      // A tree has been chosen (or not) at this point. If no tree was
+      // chosen, then this instruction, I, cannot be paired (and is no longer
+      // considered).
+
+      DEBUG(if (BestTree.size() > 0)
+              dbgs() << "BBV: selected pairs in the best tree for: "
+                     << *cast<Instruction>(*I) << "\n");
+
+      for (DenseSet<ValuePair>::iterator S = BestTree.begin(),
+           SE2 = BestTree.end(); S != SE2; ++S) {
+        // Insert the members of this tree into the list of chosen pairs.
+        ChosenPairs.insert(ValuePair(S->first, S->second));
+        DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
+               *S->second << "\n");
+
+        // Remove all candidate pairs that have values in the chosen tree.
+        for (std::multimap<Value *, Value *>::iterator K =
+               CandidatePairs.begin(); K != CandidatePairs.end();) {
+          if (K->first == S->first || K->second == S->first ||
+              K->second == S->second || K->first == S->second) {
+            // Don't remove the actual pair chosen so that it can be used
+            // in subsequent tree selections.
+            if (!(K->first == S->first && K->second == S->second))
+              CandidatePairs.erase(K++);
+            else
+              ++K;
+          } else {
+            ++K;
+          }
+        }
+      }
+    }
+
+    DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
+  }
+
+  std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
+                     unsigned n = 0) {
+    if (!I->hasName())
+      return "";
+
+    return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
+             (n > 0 ? "." + utostr(n) : "")).str();
+  }
+
+  // Returns the value that is to be used as the pointer input to the vector
+  // instruction that fuses I with J.
+  Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
+                     Instruction *I, Instruction *J, unsigned o,
+                     bool &FlipMemInputs) {
+    Value *IPtr, *JPtr;
+    unsigned IAlignment, JAlignment;
+    int64_t OffsetInElmts;
+    (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+                          OffsetInElmts);
+
+    // The pointer value is taken to be the one with the lowest offset.
+    Value *VPtr;
+    if (OffsetInElmts > 0) {
+      VPtr = IPtr;
+    } else {
+      FlipMemInputs = true;
+      VPtr = JPtr;
+    }
+
+    Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType();
+    Type *VArgType = getVecTypeForPair(ArgType);
+    Type *VArgPtrType = PointerType::get(VArgType,
+      cast<PointerType>(IPtr->getType())->getAddressSpace());
+    return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
+                        /* insert before */ FlipMemInputs ? J : I);
+  }
+
+  void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+                     unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+                     unsigned IdxOffset, std::vector<Constant*> &Mask) {
+    for (unsigned v = 0; v < NumElem/2; ++v) {
+      int m = cast<ShuffleVectorInst>(J)->getMaskValue(v);
+      if (m < 0) {
+        Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
+      } else {
+        unsigned mm = m + (int) IdxOffset;
+        if (m >= (int) NumInElem)
+          mm += (int) NumInElem;
+
+        Mask[v+MaskOffset] =
+          ConstantInt::get(Type::getInt32Ty(Context), mm);
+      }
+    }
+  }
+
+  // Returns the value that is to be used as the vector-shuffle mask to the
+  // vector instruction that fuses I with J.
+  Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
+                     Instruction *I, Instruction *J) {
+    // This is the shuffle mask. We need to append the second
+    // mask to the first, and the numbers need to be adjusted.
+
+    Type *ArgType = I->getType();
+    Type *VArgType = getVecTypeForPair(ArgType);
+
+    // Get the total number of elements in the fused vector type.
+    // By definition, this must equal the number of elements in
+    // the final mask.
+    unsigned NumElem = cast<VectorType>(VArgType)->getNumElements();
+    std::vector<Constant*> Mask(NumElem);
+
+    Type *OpType = I->getOperand(0)->getType();
+    unsigned NumInElem = cast<VectorType>(OpType)->getNumElements();
+
+    // For the mask from the first pair...
+    fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);
+
+    // For the mask from the second pair...
+    fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
+                       Mask);
+
+    return ConstantVector::get(Mask);
+  }
+
+  // Returns the value to be used as the specified operand of the vector
+  // instruction that fuses I with J.
+  Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
+                     Instruction *J, unsigned o, bool FlipMemInputs) {
+    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+      // Compute the fused vector type for this operand
+    Type *ArgType = I->getOperand(o)->getType();
+    VectorType *VArgType = getVecTypeForPair(ArgType);
+
+    Instruction *L = I, *H = J;
+    if (FlipMemInputs) {
+      L = J;
+      H = I;
+    }
+
+    if (ArgType->isVectorTy()) {
+      unsigned numElem = cast<VectorType>(VArgType)->getNumElements();
+      std::vector<Constant*> Mask(numElem);
+      for (unsigned v = 0; v < numElem; ++v)
+        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+
+      Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
+                                              H->getOperand(o),
+                                              ConstantVector::get(Mask),
+                                              getReplacementName(I, true, o));
+      BV->insertBefore(J);
+      return BV;
+    }
+
+    // If these two inputs are the output of another vector instruction,
+    // then we should use that output directly. It might be necessary to
+    // permute it first. [When pairings are fused recursively, you can
+    // end up with cases where a large vector is decomposed into scalars
+    // using extractelement instructions, then built into size-2
+    // vectors using insertelement and the into larger vectors using
+    // shuffles. InstCombine does not simplify all of these cases well,
+    // and so we make sure that shuffles are generated here when possible.
+    ExtractElementInst *LEE
+      = dyn_cast<ExtractElementInst>(L->getOperand(o));
+    ExtractElementInst *HEE
+      = dyn_cast<ExtractElementInst>(H->getOperand(o));
+
+    if (LEE && HEE &&
+        LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
+      VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType());
+      unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue();
+      unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue();
+      if (LEE->getOperand(0) == HEE->getOperand(0)) {
+        if (LowIndx == 0 && HighIndx == 1)
+          return LEE->getOperand(0);
+ 
+        std::vector<Constant*> Mask(2);
+        Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+        Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+        Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+                                          UndefValue::get(EEType),
+                                          ConstantVector::get(Mask),
+                                          getReplacementName(I, true, o));
+        BV->insertBefore(J);
+        return BV;
+      }
+
+      std::vector<Constant*> Mask(2);
+      HighIndx += EEType->getNumElements();
+      Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
+      Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
+
+      Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
+                                          HEE->getOperand(0),
+                                          ConstantVector::get(Mask),
+                                          getReplacementName(I, true, o));
+      BV->insertBefore(J);
+      return BV;
+    }
+
+    Instruction *BV1 = InsertElementInst::Create(
+                                          UndefValue::get(VArgType),
+                                          L->getOperand(o), CV0,
+                                          getReplacementName(I, true, o, 1));
+    BV1->insertBefore(I);
+    Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
+                                          CV1,
+                                          getReplacementName(I, true, o, 2));
+    BV2->insertBefore(J);
+    return BV2;
+  }
+
+  // This function creates an array of values that will be used as the inputs
+  // to the vector instruction that fuses I with J.
+  void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
+                     Instruction *I, Instruction *J,
+                     SmallVector<Value *, 3> &ReplacedOperands,
+                     bool &FlipMemInputs) {
+    FlipMemInputs = false;
+    unsigned NumOperands = I->getNumOperands();
+
+    for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
+      // Iterate backward so that we look at the store pointer
+      // first and know whether or not we need to flip the inputs.
+
+      if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) {
+        // This is the pointer for a load/store instruction.
+        ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
+                                FlipMemInputs);
+        continue;
+      } else if (isa<CallInst>(I) && o == NumOperands-1) {
+        Function *F = cast<CallInst>(I)->getCalledFunction();
+        unsigned IID = F->getIntrinsicID();
+        BasicBlock &BB = *I->getParent();
+
+        Module *M = BB.getParent()->getParent();
+        Type *ArgType = I->getType();
+        Type *VArgType = getVecTypeForPair(ArgType);
+
+        // FIXME: is it safe to do this here?
+        ReplacedOperands[o] = Intrinsic::getDeclaration(M,
+          (Intrinsic::ID) IID, VArgType);
+        continue;
+      } else if (isa<ShuffleVectorInst>(I) && o == NumOperands-1) {
+        ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
+        continue;
+      }
+
+      ReplacedOperands[o] =
+        getReplacementInput(Context, I, J, o, FlipMemInputs);
+    }
+  }
+
+  // This function creates two values that represent the outputs of the
+  // original I and J instructions. These are generally vector shuffles
+  // or extracts. In many cases, these will end up being unused and, thus,
+  // eliminated by later passes.
+  void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+                     Instruction *J, Instruction *K,
+                     Instruction *&InsertionPt,
+                     Instruction *&K1, Instruction *&K2,
+                     bool &FlipMemInputs) {
+    Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
+    Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
+
+    if (isa<StoreInst>(I)) {
+      AA->replaceWithNewValue(I, K);
+      AA->replaceWithNewValue(J, K);
+    } else {
+      Type *IType = I->getType();
+      Type *VType = getVecTypeForPair(IType);
+
+      if (IType->isVectorTy()) {
+          unsigned numElem = cast<VectorType>(IType)->getNumElements();
+          std::vector<Constant*> Mask1(numElem), Mask2(numElem);
+          for (unsigned v = 0; v < numElem; ++v) {
+            Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
+            Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
+          }
+
+          K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                       ConstantVector::get(
+                                         FlipMemInputs ? Mask2 : Mask1),
+                                       getReplacementName(K, false, 1));
+          K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
+                                       ConstantVector::get(
+                                         FlipMemInputs ? Mask1 : Mask2),
+                                       getReplacementName(K, false, 2));
+      } else {
+        K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
+                                          getReplacementName(K, false, 1));
+        K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
+                                          getReplacementName(K, false, 2));
+      }
+
+      K1->insertAfter(K);
+      K2->insertAfter(K1);
+      InsertionPt = K2;
+    }
+  }
+
+  // Move all uses of the function I (including pairing-induced uses) after J.
+  bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = BB.begin();
+    for (; cast<Instruction>(L) != I; ++L);
+    ++L;
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J; ++L)
+      (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
+
+    assert(cast<Instruction>(L) == J &&
+      "Tracking has not proceeded far enough to check for dependencies");
+    // If J is now in the use set of I, then trackUsesOfI will return true
+    // and we have a dependency cycle (and the fusing operation must abort).
+    return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
+  }
+
+  // Move all uses of the function I (including pairing-induced uses) after J.
+  void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *&InsertionPt,
+                     Instruction *I, Instruction *J) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = BB.begin();
+    for (; cast<Instruction>(L) != I; ++L);
+    ++L;
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+    for (; cast<Instruction>(L) != J;) {
+      if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
+        // Move this instruction
+        Instruction *InstToMove = L; ++L;
+
+        DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
+                        " to after " << *InsertionPt << "\n");
+        InstToMove->removeFromParent();
+        InstToMove->insertAfter(InsertionPt);
+        InsertionPt = InstToMove;
+      } else {
+        ++L;
+      }
+    }
+  }
+
+  // Collect all load instruction that are in the move set of a given first
+  // pair member.  These loads depend on the first instruction, I, and so need
+  // to be moved after J (the second instruction) when the pair is fused.
+  void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet,
+                     Instruction *I) {
+    // Skip to the first instruction past I.
+    BasicBlock::iterator L = BB.begin();
+    for (; cast<Instruction>(L) != I; ++L);
+    ++L;
+
+    DenseSet<Value *> Users;
+    AliasSetTracker WriteSet(*AA);
+
+    // Note: We cannot end the loop when we reach J because J could be moved
+    // farther down the use chain by another instruction pairing. Also, J
+    // could be before I if this is an inverted input.
+    for (BasicBlock::iterator E = BB.end(); cast<Instruction>(L) != E; ++L) {
+      if (trackUsesOfI(Users, WriteSet, I, L)) {
+        if (L->mayReadFromMemory())
+          LoadMoveSet.insert(ValuePair(L, I));
+      }
+    }
+  }
+
+  // In cases where both load/stores and the computation of their pointers
+  // are chosen for vectorization, we can end up in a situation where the
+  // aliasing analysis starts returning different query results as the
+  // process of fusing instruction pairs continues. Because the algorithm
+  // relies on finding the same use trees here as were found earlier, we'll
+  // need to precompute the necessary aliasing information here and then
+  // manually update it during the fusion process.
+  void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs,
+                     std::multimap<Value *, Value *> &LoadMoveSet) {
+    for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+         PIE = PairableInsts.end(); PI != PIE; ++PI) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI);
+      if (P == ChosenPairs.end()) continue;
+
+      Instruction *I = cast<Instruction>(P->first);
+      collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
+    }
+  }
+
+  // This function fuses the chosen instruction pairs into vector instructions,
+  // taking care preserve any needed scalar outputs and, then, it reorders the
+  // remaining instructions as needed (users of the first member of the pair
+  // need to be moved to after the location of the second member of the pair
+  // because the vector instruction is inserted in the location of the pair's
+  // second member).
+  void BBVectorize::fuseChosenPairs(BasicBlock &BB,
+                     std::vector<Value *> &PairableInsts,
+                     DenseMap<Value *, Value *> &ChosenPairs) {
+    LLVMContext& Context = BB.getContext();
+
+    // During the vectorization process, the order of the pairs to be fused
+    // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
+    // list. After a pair is fused, the flipped pair is removed from the list.
+    std::vector<ValuePair> FlippedPairs;
+    FlippedPairs.reserve(ChosenPairs.size());
+    for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
+         E = ChosenPairs.end(); P != E; ++P)
+      FlippedPairs.push_back(ValuePair(P->second, P->first));
+    for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+         E = FlippedPairs.end(); P != E; ++P)
+      ChosenPairs.insert(*P);
+
+    std::multimap<Value *, Value *> LoadMoveSet;
+    collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
+
+    DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
+
+    for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
+      DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(PI);
+      if (P == ChosenPairs.end()) {
+        ++PI;
+        continue;
+      }
+
+      if (getDepthFactor(P->first) == 0) {
+        // These instructions are not really fused, but are tracked as though
+        // they are. Any case in which it would be interesting to fuse them
+        // will be taken care of by InstCombine.
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      Instruction *I = cast<Instruction>(P->first),
+        *J = cast<Instruction>(P->second);
+
+      DEBUG(dbgs() << "BBV: fusing: " << *I <<
+             " <-> " << *J << "\n");
+
+      // Remove the pair and flipped pair from the list.
+      DenseMap<Value *, Value *>::iterator FP = ChosenPairs.find(P->second);
+      assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
+      ChosenPairs.erase(FP);
+      ChosenPairs.erase(P);
+
+      if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
+        DEBUG(dbgs() << "BBV: fusion of: " << *I <<
+               " <-> " << *J <<
+               " aborted because of non-trivial dependency cycle\n");
+        --NumFusedOps;
+        ++PI;
+        continue;
+      }
+
+      bool FlipMemInputs;
+      unsigned NumOperands = I->getNumOperands();
+      SmallVector<Value *, 3> ReplacedOperands(NumOperands);
+      getReplacementInputsForPair(Context, I, J, ReplacedOperands,
+        FlipMemInputs);
+
+      // Make a copy of the original operation, change its type to the vector
+      // type and replace its operands with the vector operands.
+      Instruction *K = I->clone();
+      if (I->hasName()) K->takeName(I);
+
+      if (!isa<StoreInst>(K))
+        K->mutateType(getVecTypeForPair(I->getType()));
+
+      for (unsigned o = 0; o < NumOperands; ++o)
+        K->setOperand(o, ReplacedOperands[o]);
+
+      // If we've flipped the memory inputs, make sure that we take the correct
+      // alignment.
+      if (FlipMemInputs) {
+        if (isa<StoreInst>(K))
+          cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
+        else
+          cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
+      }
+
+      K->insertAfter(J);
+
+      // Instruction insertion point:
+      Instruction *InsertionPt = K;
+      Instruction *K1 = 0, *K2 = 0;
+      replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
+        FlipMemInputs);
+
+      // The use tree of the first original instruction must be moved to after
+      // the location of the second instruction. The entire use tree of the
+      // first instruction is disjoint from the input tree of the second
+      // (by definition), and so commutes with it.
+
+      moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
+
+      if (!isa<StoreInst>(I)) {
+        I->replaceAllUsesWith(K1);
+        J->replaceAllUsesWith(K2);
+        AA->replaceWithNewValue(I, K1);
+        AA->replaceWithNewValue(J, K2);
+      }
+
+      // Instructions that may read from memory may be in the load move set.
+      // Once an instruction is fused, we no longer need its move set, and so
+      // the values of the map never need to be updated. However, when a load
+      // is fused, we need to merge the entries from both instructions in the
+      // pair in case those instructions were in the move set of some other
+      // yet-to-be-fused pair. The loads in question are the keys of the map.
+      if (I->mayReadFromMemory()) {
+        std::vector<ValuePair> NewSetMembers;
+        VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
+        VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
+        for (std::multimap<Value *, Value *>::iterator N = IPairRange.first;
+             N != IPairRange.second; ++N)
+          NewSetMembers.push_back(ValuePair(K, N->second));
+        for (std::multimap<Value *, Value *>::iterator N = JPairRange.first;
+             N != JPairRange.second; ++N)
+          NewSetMembers.push_back(ValuePair(K, N->second));
+        for (std::vector<ValuePair>::iterator A = NewSetMembers.begin(),
+             AE = NewSetMembers.end(); A != AE; ++A)
+          LoadMoveSet.insert(*A);
+      }
+
+      // Before removing I, set the iterator to the next instruction.
+      PI = llvm::next(BasicBlock::iterator(I));
+      if (cast<Instruction>(PI) == J)
+        ++PI;
+
+      SE->forgetValue(I);
+      SE->forgetValue(J);
+      I->eraseFromParent();
+      J->eraseFromParent();
+    }
+
+    DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
+  }
+}
+
+char BBVectorize::ID = 0;
+static const char bb_vectorize_name[] = "Basic-Block Vectorization";
+INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
+
+BasicBlockPass *llvm::createBBVectorizePass() {
+  return new BBVectorize();
+}
+
diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt
new file mode 100644
index 0000000000..4b6693015c
--- /dev/null
+++ b/lib/Transforms/Vectorize/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_llvm_library(LLVMVectorize
+  BBVectorize.cpp
+  Vectorize.cpp
+  )
diff --git a/lib/Transforms/Vectorize/LLVMBuild.txt b/lib/Transforms/Vectorize/LLVMBuild.txt
new file mode 100644
index 0000000000..7167d273ae
--- /dev/null
+++ b/lib/Transforms/Vectorize/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Transforms/Scalar/LLVMBuild.txt --------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = Vectorize
+parent = Transforms
+library_name = Vectorize
+required_libraries = Analysis Core InstCombine Support Target TransformUtils
+
diff --git a/lib/Transforms/Vectorize/Makefile b/lib/Transforms/Vectorize/Makefile
new file mode 100644
index 0000000000..86c36585f2
--- /dev/null
+++ b/lib/Transforms/Vectorize/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMVectorize
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
new file mode 100644
index 0000000000..1ef60029bc
--- /dev/null
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -0,0 +1,39 @@
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which 
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+
+/// initializeVectorizationPasses - Initialize all passes linked into the 
+/// Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+  initializeBBVectorizePass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+  initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createBBVectorizePass());
+}
+
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
new file mode 100644
index 0000000000..32a91ceee0
--- /dev/null
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -0,0 +1,112 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise
+; want to select the pairs:
+; %div77 = fdiv double %sub74, %mul76.v.r1 <->   %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
+; %add84 = fadd double %sub83, 2.000000e+00 <->   %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
+; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <->   %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
+; %mul117 = fmul double %sub39.v.r1, %sub116 <->   %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
+; and so a dependency cycle would be created.
+
+declare double @fabs(double) nounwind readnone
+define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
+entry:
+  br label %go
+go:
+  %conv = sitofp i32 %n.0 to double
+  %add35 = fadd double %conv, %a
+  %sub36 = fadd double %add35, -1.000000e+00
+  %add38 = fadd double %conv, %b
+  %sub39 = fadd double %add38, -1.000000e+00
+  %add41 = fadd double %conv, %c
+  %sub42 = fadd double %add41, -1.000000e+00
+  %sub45 = fadd double %add35, -2.000000e+00
+  %sub48 = fadd double %add38, -2.000000e+00
+  %sub51 = fadd double %add41, -2.000000e+00
+  %mul52 = shl nsw i32 %n.0, 1
+  %sub53 = add nsw i32 %mul52, -1
+  %conv54 = sitofp i32 %sub53 to double
+  %sub56 = add nsw i32 %mul52, -3
+  %conv57 = sitofp i32 %sub56 to double
+  %sub59 = add nsw i32 %mul52, -5
+  %conv60 = sitofp i32 %sub59 to double
+  %mul61 = mul nsw i32 %n.0, %n.0
+  %conv62 = sitofp i32 %mul61 to double
+  %mul63 = fmul double %conv62, 3.000000e+00
+  %mul67 = fmul double %sub65, %conv
+  %add68 = fadd double %mul63, %mul67
+  %add69 = fadd double %add68, 2.000000e+00
+  %sub71 = fsub double %add69, %mul2.v.r1
+  %sub74 = fsub double %sub71, %mul73
+  %mul75 = fmul double %conv57, 2.000000e+00
+  %mul76 = fmul double %mul75, %sub42
+  %div77 = fdiv double %sub74, %mul76
+  %mul82 = fmul double %add80, %conv
+  %sub83 = fsub double %mul63, %mul82
+  %add84 = fadd double %sub83, 2.000000e+00
+  %sub86 = fsub double %add84, %mul2.v.r1
+  %sub87 = fsub double -0.000000e+00, %sub86
+  %mul88 = fmul double %sub36, %sub87
+  %mul89 = fmul double %mul88, %sub39
+  %mul90 = fmul double %conv54, 4.000000e+00
+  %mul91 = fmul double %mul90, %conv57
+  %mul92 = fmul double %mul91, %sub51
+  %mul93 = fmul double %mul92, %sub42
+  %div94 = fdiv double %mul89, %mul93
+  %mul95 = fmul double %sub45, %sub36
+  %mul96 = fmul double %mul95, %sub48
+  %mul97 = fmul double %mul96, %sub39
+  %sub99 = fsub double %conv, %a
+  %sub100 = fadd double %sub99, -2.000000e+00
+  %mul101 = fmul double %mul97, %sub100
+  %sub103 = fsub double %conv, %b
+  %sub104 = fadd double %sub103, -2.000000e+00
+  %mul105 = fmul double %mul101, %sub104
+  %mul106 = fmul double %conv57, 8.000000e+00
+  %mul107 = fmul double %mul106, %conv57
+  %mul108 = fmul double %mul107, %conv60
+  %sub111 = fadd double %add41, -3.000000e+00
+  %mul112 = fmul double %mul108, %sub111
+  %mul113 = fmul double %mul112, %sub51
+  %mul114 = fmul double %mul113, %sub42
+  %div115 = fdiv double %mul105, %mul114
+  %sub116 = fsub double -0.000000e+00, %sub36
+  %mul117 = fmul double %sub39, %sub116
+  %sub119 = fsub double %conv, %c
+  %sub120 = fadd double %sub119, -1.000000e+00
+  %mul121 = fmul double %mul117, %sub120
+  %mul123 = fmul double %mul75, %sub51
+  %mul124 = fmul double %mul123, %sub42
+  %div125 = fdiv double %mul121, %mul124
+  %mul126 = fmul double %div77, %sub
+  %add127 = fadd double %mul126, 1.000000e+00
+  %mul128 = fmul double %add127, %Anm1.0
+  %mul129 = fmul double %div94, %sub
+  %add130 = fadd double %div125, %mul129
+  %mul131 = fmul double %add130, %sub
+  %mul132 = fmul double %mul131, %Anm2.0
+  %add133 = fadd double %mul128, %mul132
+  %mul134 = fmul double %div115, %mul1
+  %mul135 = fmul double %mul134, %Anm3.0
+  %add136 = fadd double %add133, %mul135
+  %mul139 = fmul double %add127, %Bnm1.0
+  %mul143 = fmul double %mul131, %Bnm2.0
+  %add144 = fadd double %mul139, %mul143
+  %mul146 = fmul double %mul134, %Bnm3.0
+  %add147 = fadd double %add144, %mul146
+  %div148 = fdiv double %add136, %add147
+  %sub149 = fsub double %F.0, %div148
+  %div150 = fdiv double %sub149, %F.0
+  %call = tail call double @fabs(double %div150) nounwind readnone
+  %cmp = fcmp olt double %call, 0x3CB0000000000000
+  %cmp152 = icmp sgt i32 %n.0, 20000
+  %or.cond = or i1 %cmp, %cmp152
+  br i1 %or.cond, label %done, label %go
+done:
+  ret void
+; CHECK: @test1
+; CHECK: go:
+; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; FIXME: When tree pruning is deterministic, include the entire output.
+}
diff --git a/test/Transforms/BBVectorize/dg.exp b/test/Transforms/BBVectorize/dg.exp
new file mode 100644
index 0000000000..f2005891a5
--- /dev/null
+++ b/test/Transforms/BBVectorize/dg.exp
@@ -0,0 +1,3 @@
+load_lib llvm.exp
+
+RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
diff --git a/test/Transforms/BBVectorize/ld1.ll b/test/Transforms/BBVectorize/ld1.ll
new file mode 100644
index 0000000000..cea225d076
--- /dev/null
+++ b/test/Transforms/BBVectorize/ld1.ll
@@ -0,0 +1,41 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %i2 = load double* %c, align 8
+  %add = fadd double %mul, %i2
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %arrayidx6 = getelementptr inbounds double* %c, i64 1
+  %i5 = load double* %arrayidx6, align 8
+  %add7 = fadd double %mul5, %i5
+  %mul9 = fmul double %add, %i1
+  %add11 = fadd double %mul9, %i2
+  %mul13 = fmul double %add7, %i4
+  %add15 = fadd double %mul13, %i5
+  %mul16 = fmul double %add11, %add15
+  ret double %mul16
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8
+; CHECK: %add = fadd <2 x double> %mul, %i2
+; CHECK: %mul9 = fmul <2 x double> %add, %i1
+; CHECK: %add11 = fadd <2 x double> %mul9, %i2
+; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
+; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
+; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
+; CHECK: ret double %mul16
+}
+
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
new file mode 100644
index 0000000000..bebc91ad91
--- /dev/null
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -0,0 +1,93 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
+; The second check covers the use of alias analysis (with loop unrolling).
+
+define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
+entry:
+  br label %for.body
+; CHECK: @test1
+; CHECK-UNRL: @test1
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 8
+  %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+  %1 = load double* %arrayidx2, align 8
+  %mul = fmul double %0, %0
+  %mul3 = fmul double %0, %1
+  %add = fadd double %mul, %mul3
+  %add4 = fadd double %1, %1
+  %add5 = fadd double %add4, %0
+  %mul6 = fmul double %0, %add5
+  %add7 = fadd double %add, %mul6
+  %mul8 = fmul double %1, %1
+  %add9 = fadd double %0, %0
+  %add10 = fadd double %add9, %0
+  %mul11 = fmul double %mul8, %add10
+  %add12 = fadd double %add7, %mul11
+  %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+  store double %add12, double* %arrayidx14, align 8
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10
+  br i1 %exitcond, label %for.end, label %for.body
+; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK: %0 = load double* %arrayidx, align 8
+; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK: %1 = load double* %arrayidx2, align 8
+; CHECK: %mul = fmul double %0, %0
+; CHECK: %mul3 = fmul double %0, %1
+; CHECK: %add = fadd double %mul, %mul3
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
+; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
+; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
+; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
+; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
+; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
+; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
+; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
+; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
+; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
+; CHECK: %add7 = fadd double %add, %mul6.v.r1
+; CHECK: %add12 = fadd double %add7, %mul6.v.r2
+; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK: store double %add12, double* %arrayidx14, align 8
+; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
+; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
+; CHECK: br i1 %exitcond, label %for.end, label %for.body
+; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
+; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
+; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
+; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
+; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
+; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
+; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
+; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
+; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
+; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
+; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
+; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
+; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
+; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
+; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
+; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
+; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
+; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
+; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
+; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
+; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
+; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
+; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
+; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
+; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
+; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
diff --git a/test/Transforms/BBVectorize/req-depth.ll b/test/Transforms/BBVectorize/req-depth.ll
new file mode 100644
index 0000000000..8c9cc3c188
--- /dev/null
+++ b/test/Transforms/BBVectorize/req-depth.ll
@@ -0,0 +1,17 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+	%R  = fmul double %Y1, %Y2
+	ret double %R
+; CHECK-RD3: @test1
+; CHECK-RD2: @test1
+; CHECK-RD3-NOT: <2 x double>
+; CHECK-RD2: <2 x double>
+}
+
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
new file mode 100644
index 0000000000..d9945b5630
--- /dev/null
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -0,0 +1,46 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
+
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK-SL4: @test1
+; CHECK-SL4-NOT: <2 x double>
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+        ; Here we have a dependency chain: the short search limit will not
+        ; see past this chain and so will not see the second part of the
+        ; pair to vectorize.
+        %mul41 = fmul double %Z1, %Y2
+        %sub48 = fsub double %Z1, %mul41
+        %mul62 = fmul double %Z1, %sub48
+        %sub69 = fsub double %Z1, %mul62
+        %mul83 = fmul double %Z1, %sub69
+        %sub90 = fsub double %Z1, %mul83
+        %mul104 = fmul double %Z1, %sub90
+        %sub111 = fsub double %Z1, %mul104
+        %mul125 = fmul double %Z1, %sub111
+        %sub132 = fsub double %Z1, %mul125
+        %mul146 = fmul double %Z1, %sub132
+        %sub153 = fsub double %Z1, %mul146
+        ; end of chain.
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R1  = fdiv double %Z1, %Z2
+        %R   = fmul double %R1, %sub153
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
new file mode 100644
index 0000000000..b2ef27b2b8
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -0,0 +1,59 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+declare double @llvm.fma.f64(double, double, double)
+declare double @llvm.cos.f64(double)
+
+; Basic depth-3 chain with fma
+define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
+	%Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
+; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
+; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain with cos
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+	%Y1 = call double @llvm.cos.f64(double %X1)
+	%Y2 = call double @llvm.cos.f64(double %X2)
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+	%R  = fmul double %Z1, %Z2
+	ret double %R
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: ret double %R
+}
+
+; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
+; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
+
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
new file mode 100644
index 0000000000..a5397eeb1f
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -0,0 +1,110 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
+
+; Simple 3-pair chain with loads and stores
+define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test1
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test1
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with extending loads and stores
+define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly {
+entry:
+  %i0f = load float* %a, align 4
+  %i0 = fpext float %i0f to double
+  %i1f = load float* %b, align 4
+  %i1 = fpext float %i1f to double
+  %mul = fmul double %i0, %i1
+  %arrayidx3 = getelementptr inbounds float* %a, i64 1
+  %i3f = load float* %arrayidx3, align 4
+  %i3 = fpext float %i3f to double
+  %arrayidx4 = getelementptr inbounds float* %b, i64 1
+  %i4f = load float* %arrayidx4, align 4
+  %i4 = fpext float %i4f to double
+  %mul5 = fmul double %i3, %i4
+  store double %mul, double* %c, align 8
+  %arrayidx5 = getelementptr inbounds double* %c, i64 1
+  store double %mul5, double* %arrayidx5, align 8
+  ret void
+; CHECK: @test2
+; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
+; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
+; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4
+; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
+; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4
+; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %0 = bitcast double* %c to <2 x double>*
+; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test2
+; CHECK-AO-NOT: <2 x double>
+}
+
+; Simple chain with loads and truncating stores
+define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly {
+entry:
+  %i0 = load double* %a, align 8
+  %i1 = load double* %b, align 8
+  %mul = fmul double %i0, %i1
+  %mulf = fptrunc double %mul to float
+  %arrayidx3 = getelementptr inbounds double* %a, i64 1
+  %i3 = load double* %arrayidx3, align 8
+  %arrayidx4 = getelementptr inbounds double* %b, i64 1
+  %i4 = load double* %arrayidx4, align 8
+  %mul5 = fmul double %i3, %i4
+  %mul5f = fptrunc double %mul5 to float
+  store float %mulf, float* %c, align 8
+  %arrayidx5 = getelementptr inbounds float* %c, i64 1
+  store float %mul5f, float* %arrayidx5, align 4
+  ret void
+; CHECK: @test3
+; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
+; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
+; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
+; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
+; CHECK: %mul = fmul <2 x double> %i0, %i1
+; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK: %0 = bitcast float* %c to <2 x float>*
+; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK: ret void
+; CHECK-AO: @test3
+; CHECK-AO: %i0 = load double* %a, align 8
+; CHECK-AO: %i1 = load double* %b, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
+; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
+; CHECK-AO: %i3 = load double* %arrayidx3, align 8
+; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
+; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
+; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
+; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
+; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
+; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
+; CHECK-AO: ret void
+}
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
new file mode 100644
index 0000000000..904d766bb6
--- /dev/null
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -0,0 +1,152 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
+
+; Basic depth-3 chain
+define double @test1(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test1
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair permuted)
+define double @test2(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test2
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair first splat)
+define double @test3(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test3
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y2, %B1
+	%Z2 = fadd double %Y2, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain (last pair second splat)
+define double @test4(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test4
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z1 = fadd double %Y1, %B1
+	%Z2 = fadd double %Y1, %B2
+; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
+; Basic depth-3 chain
+define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
+; CHECK: @test5
+; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+	%X1 = fsub <2 x float> %A1, %B1
+	%X2 = fsub <2 x float> %A2, %B2
+; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1
+	%Y1 = fmul <2 x float> %X1, %A1
+	%Y2 = fmul <2 x float> %X2, %A2
+; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0
+	%Z1 = fadd <2 x float> %Y1, %B1
+	%Z2 = fadd <2 x float> %Y2, %B2
+; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1
+	%R  = fmul <2 x float> %Z1, %Z2
+; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2
+	ret <2 x float> %R
+; CHECK: ret <2 x float> %R
+}
+
+; Basic chain with shuffles
+define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
+; CHECK: @test6
+; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+	%X1 = sub <8 x i8> %A1, %B1
+	%X2 = sub <8 x i8> %A2, %B2
+; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1
+	%Y1 = mul <8 x i8> %X1, %A1
+	%Y2 = mul <8 x i8> %X2, %A2
+; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0
+	%Z1 = add <8 x i8> %Y1, %B1
+	%Z2 = add <8 x i8> %Y2, %B2
+; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
+        %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
+        %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
+; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 undef, i32 10, i32 undef, i32 undef, i32 13, i32 undef, i32 15>
+; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 undef, i32 2, i32 undef, i32 undef, i32 5, i32 undef, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
+	%R  = mul <8 x i8> %Q1, %Q2
+; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
+	ret <8 x i8> %R
+; CHECK: ret <8 x i8> %R
+}
+
+
diff --git a/tools/bugpoint/CMakeLists.txt b/tools/bugpoint/CMakeLists.txt
index e06feb1003..ee2235bf42 100644
--- a/tools/bugpoint/CMakeLists.txt
+++ b/tools/bugpoint/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo
-  linker bitreader bitwriter)
+  linker bitreader bitwriter vectorize)
 
 add_llvm_tool(bugpoint
   BugDriver.cpp
diff --git a/tools/bugpoint/Makefile b/tools/bugpoint/Makefile
index eacaa47a7b..34f4bddb01 100644
--- a/tools/bugpoint/Makefile
+++ b/tools/bugpoint/Makefile
@@ -10,6 +10,6 @@
 LEVEL := ../..
 TOOLNAME := bugpoint
 LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \
-                   bitwriter
+                   bitwriter vectorize
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/llvm-ld/CMakeLists.txt b/tools/llvm-ld/CMakeLists.txt
index 370bcb4abf..d328a04b0e 100644
--- a/tools/llvm-ld/CMakeLists.txt
+++ b/tools/llvm-ld/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter)
+set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize)
 
 add_llvm_tool(llvm-ld
   Optimize.cpp
diff --git a/tools/llvm-ld/Makefile b/tools/llvm-ld/Makefile
index 2ec6e0a43a..8793ca9c10 100644
--- a/tools/llvm-ld/Makefile
+++ b/tools/llvm-ld/Makefile
@@ -9,6 +9,6 @@
 
 LEVEL := ../..
 TOOLNAME := llvm-ld
-LINK_COMPONENTS := ipo scalaropts linker archive bitwriter
+LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/lto/CMakeLists.txt b/tools/lto/CMakeLists.txt
index 7e2c5f06e6..911297609b 100644
--- a/tools/lto/CMakeLists.txt
+++ b/tools/lto/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
-  ipo scalaropts linker bitreader bitwriter mcdisassembler)
+  ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize)
 
 add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )
 
diff --git a/tools/lto/Makefile b/tools/lto/Makefile
index ef78f82c8c..153fa03137 100644
--- a/tools/lto/Makefile
+++ b/tools/lto/Makefile
@@ -10,7 +10,7 @@
 LEVEL := ../..
 LIBRARYNAME := LTO
 LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \
-                   mcdisassembler
+                   mcdisassembler vectorize
 LINK_LIBS_IN_SHARED := 1
 SHARED_LIBRARY := 1
 
diff --git a/tools/opt/CMakeLists.txt b/tools/opt/CMakeLists.txt
index 0570d0e04a..7daf22aa9e 100644
--- a/tools/opt/CMakeLists.txt
+++ b/tools/opt/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo)
+set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)
 
 add_llvm_tool(opt
   AnalysisWrappers.cpp
diff --git a/tools/opt/Makefile b/tools/opt/Makefile
index e8cd7e2857..16d116da5d 100644
--- a/tools/opt/Makefile
+++ b/tools/opt/Makefile
@@ -9,6 +9,6 @@
 
 LEVEL := ../..
 TOOLNAME := opt
-LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo
+LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index 51613649b5..30da863b41 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -480,6 +480,7 @@ int main(int argc, char **argv) {
   PassRegistry &Registry = *PassRegistry::getPassRegistry();
   initializeCore(Registry);
   initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
   initializeIPO(Registry);
   initializeAnalysis(Registry);
   initializeIPA(Registry);
-- 
cgit v1.2.3