BBVectorize: Choose pair ordering to minimize shuffles

BBVectorize would, except for loads and stores, always fuse instructions so that the first instruction (in the current source order) would always represent the low part of the input vectors and the second instruction would always represent the high part. This lead to too many shuffles being produced because sometimes the opposite order produces fewer of them. With this change, BBVectorize tracks the kind of pair connections that form the DAG of candidate pairs, and uses that information to reorder the pairs to avoid excess shuffles. Using this information, a future commit will be able to add VTTI-based shuffle costs to the pair selection procedure. Importantly, the number of remaining shuffles can now be estimated during pair selection. There are some trivial instruction reorderings in the test cases, and one simple additional test where we certainly want to do a reordering to avoid an unnecessary shuffle. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167122 91177308-0d34-0410-b5e6-96231b3b80d8
author: Hal Finkel <hfinkel@anl.gov> 2012-10-31 15:17:07 +0000
committer: Hal Finkel <hfinkel@anl.gov> 2012-10-31 15:17:07 +0000
commit: 72465ea23d010507d3746adc126d719005981e05 (patch)
tree: d5e6b1ad3aad528df1c41d88c82db6a62ba61ca4
parent: ef026f1b5e4d52e11c67a1a5ad01eadffcfa4d8e (diff)
download: llvm-72465ea23d010507d3746adc126d719005981e05.tar.gz
llvm-72465ea23d010507d3746adc126d719005981e05.tar.bz2
llvm-72465ea23d010507d3746adc126d719005981e05.tar.xz
10 files changed, 255 insertions, 96 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 051606b6d8..40277dc237 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -166,6 +166,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check",
   cl::init(false), cl::Hidden,
   cl::desc("When debugging is enabled, output information on the"
            " cycle-checking process"));
+
+static cl::opt<bool>
+PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
+  cl::init(false), cl::Hidden,
+  cl::desc("When debugging is enabled, dump the basic block after"
+           " every pair is fused"));
 #endif
 
 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
@@ -196,6 +202,7 @@ namespace {
     typedef std::pair<ValuePair, int> ValuePairWithCost;
     typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
     typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+    typedef std::pair<VPPair, unsigned> VPPairWithType;
     typedef std::pair<std::multimap<Value *, Value *>::iterator,
               std::multimap<Value *, Value *>::iterator> VPIteratorPair;
     typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
@@ -220,9 +227,16 @@ namespace {
                        DenseMap<ValuePair, int> &CandidatePairCostSavings,
                        std::vector<Value *> &PairableInsts, bool NonPow2Len);
 
+    enum PairConnectionType {
+      PairConnectionDirect,
+      PairConnectionSwap,
+      PairConnectionSplat
+    };
+
     void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
                        std::vector<Value *> &PairableInsts,
-                       std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+                       std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                       DenseMap<VPPair, unsigned> &PairConnectionTypes);
 
     void buildDepMap(BasicBlock &BB,
                        std::multimap<Value *, Value *> &CandidatePairs,
@@ -239,7 +253,11 @@ namespace {
     void fuseChosenPairs(BasicBlock &BB,
                      std::vector<Value *> &PairableInsts,
                      DenseMap<Value *, Value *>& ChosenPairs,
-                     DenseSet<ValuePair> &FixedOrderPairs);
+                     DenseSet<ValuePair> &FixedOrderPairs,
+                     DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairDeps);
+
 
     bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
 
@@ -256,6 +274,7 @@ namespace {
                       std::multimap<Value *, Value *> &CandidatePairs,
                       std::vector<Value *> &PairableInsts,
                       std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes,
                       ValuePair P);
 
     bool pairsConflict(ValuePair P, ValuePair Q,
@@ -310,14 +329,15 @@ namespace {
 
     bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
                        unsigned o, Value *&LOp, unsigned numElemL,
-                       Type *ArgTypeL, Type *ArgTypeR,
+                       Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
                        unsigned IdxOff = 0);
 
     Value *getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o);
+                     Instruction *J, unsigned o, bool IBeforeJ);
 
     void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
-                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands);
+                     Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ);
 
     void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
                      Instruction *J, Instruction *K,
@@ -647,6 +667,8 @@ namespace {
     std::vector<Value *> AllPairableInsts;
     DenseMap<Value *, Value *> AllChosenPairs;
     DenseSet<ValuePair> AllFixedOrderPairs;
+    DenseMap<VPPair, unsigned> AllPairConnectionTypes;
+    std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps;
 
     do {
       std::vector<Value *> PairableInsts;
@@ -668,10 +690,18 @@ namespace {
       // Note that it only matters that both members of the second pair use some
       // element of the first pair (to allow for splatting).
 
-      std::multimap<ValuePair, ValuePair> ConnectedPairs;
-      computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+      std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps;
+      DenseMap<VPPair, unsigned> PairConnectionTypes;
+      computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs,
+                            PairConnectionTypes);
       if (ConnectedPairs.empty()) continue;
 
+      for (std::multimap<ValuePair, ValuePair>::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I) {
+        ConnectedPairDeps.insert(VPPair(I->second, I->first));
+      }
+
       // Build the pairable-instruction dependency map
       DenseSet<ValuePair> PairableInstUsers;
       buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
@@ -692,12 +722,37 @@ namespace {
                               PairableInsts.end());
       AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
 
+      // Only for the chosen pairs, propagate information on fixed-order pairs,
+      // pair connections, and their types to the data structures used by the
+      // pair fusion procedures.
       for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(),
            IE = ChosenPairs.end(); I != IE; ++I) {
         if (FixedOrderPairs.count(*I))
           AllFixedOrderPairs.insert(*I);
         else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
           AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
+
+        for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin();
+             J != IE; ++J) {
+          DenseMap<VPPair, unsigned>::iterator K =
+            PairConnectionTypes.find(VPPair(*I, *J));
+          if (K != PairConnectionTypes.end()) {
+            AllPairConnectionTypes.insert(*K);
+          } else {
+            K = PairConnectionTypes.find(VPPair(*J, *I));
+            if (K != PairConnectionTypes.end())
+              AllPairConnectionTypes.insert(*K);
+          }
+        }
+      }
+
+      for (std::multimap<ValuePair, ValuePair>::iterator
+           I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
+           I != IE; ++I) {
+        if (AllPairConnectionTypes.count(*I)) {
+          AllConnectedPairs.insert(*I);
+          AllConnectedPairDeps.insert(VPPair(I->second, I->first));
+        }
       }
     } while (ShouldContinue);
 
@@ -711,7 +766,9 @@ namespace {
     // replaced with a vector_extract on the result.  Subsequent optimization
     // passes should coalesce the build/extract combinations.
 
-    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs);
+    fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
+                    AllPairConnectionTypes,
+                    AllConnectedPairs, AllConnectedPairDeps);
 
     // It is important to cleanup here so that future iterations of this
     // function have less work to do.
@@ -1098,6 +1155,7 @@ namespace {
                       std::multimap<Value *, Value *> &CandidatePairs,
                       std::vector<Value *> &PairableInsts,
                       std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes,
                       ValuePair P) {
     StoreInst *SI, *SJ;
 
@@ -1129,12 +1187,18 @@ namespace {
         VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
 
         // Look for <I, J>:
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
+        }
 
         // Look for <J, I>:
-        if (isSecondInIteratorPair<Value*>(*I, JPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+        if (isSecondInIteratorPair<Value*>(*I, JPairRange)) {
+          VPPair VP(P, ValuePair(*J, *I));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
+        }
       }
 
       if (Config.SplatBreaksChain) continue;
@@ -1145,8 +1209,11 @@ namespace {
             P.first == SJ->getPointerOperand())
           continue;
 
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
       }
     }
 
@@ -1168,8 +1235,11 @@ namespace {
             P.second == SJ->getPointerOperand())
           continue;
 
-        if (isSecondInIteratorPair<Value*>(*J, IPairRange))
-          ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+        if (isSecondInIteratorPair<Value*>(*J, IPairRange)) {
+          VPPair VP(P, ValuePair(*I, *J));
+          ConnectedPairs.insert(VP);
+          PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
+        }
       }
     }
   }
@@ -1180,7 +1250,8 @@ namespace {
   void BBVectorize::computeConnectedPairs(
                       std::multimap<Value *, Value *> &CandidatePairs,
                       std::vector<Value *> &PairableInsts,
-                      std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+                      std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                      DenseMap<VPPair, unsigned> &PairConnectionTypes) {
 
     for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
          PE = PairableInsts.end(); PI != PE; ++PI) {
@@ -1189,7 +1260,7 @@ namespace {
       for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
            P != choiceRange.second; ++P)
         computePairsConnectedTo(CandidatePairs, PairableInsts,
-                                ConnectedPairs, *P);
+                                ConnectedPairs, PairConnectionTypes, *P);
     }
 
     DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
@@ -1776,7 +1847,7 @@ namespace {
                                   Instruction *J, unsigned o, Value *&LOp,
                                   unsigned numElemL,
                                   Type *ArgTypeL, Type *ArgTypeH,
-                                  unsigned IdxOff) {
+                                  bool IBeforeJ, unsigned IdxOff) {
     bool ExpandedIEChain = false;
     if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
       // If we have a pure insertelement chain, then this can be rewritten
@@ -1810,8 +1881,9 @@ namespace {
           LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
                              ConstantInt::get(Type::getInt32Ty(Context),
                                               i + IdxOff),
-                             getReplacementName(I, true, o, i+1));
-          LIENext->insertBefore(J);
+                             getReplacementName(IBeforeJ ? I : J,
+                                                true, o, i+1));
+          LIENext->insertBefore(IBeforeJ ? J : I);
           LIEPrev = LIENext;
         }
 
@@ -1826,7 +1898,7 @@ namespace {
   // Returns the value to be used as the specified operand of the vector
   // instruction that fuses I with J.
   Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
-                     Instruction *J, unsigned o) {
+                     Instruction *J, unsigned o, bool IBeforeJ) {
     Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
     Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
 
@@ -1989,8 +2061,9 @@ namespace {
           Instruction *S =
             new ShuffleVectorInst(I1, UndefValue::get(I1T),
                                   ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o));
-          S->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o));
+          S->insertBefore(IBeforeJ ? J : I);
           return S;
         }
 
@@ -2011,8 +2084,9 @@ namespace {
           Instruction *NewI1 =
             new ShuffleVectorInst(I1, UndefValue::get(I1T),
                                   ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o, 1));
-          NewI1->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI1->insertBefore(IBeforeJ ? J : I);
           I1 = NewI1;
           I1T = I2T;
           I1Elem = I2Elem;
@@ -2027,8 +2101,9 @@ namespace {
           Instruction *NewI2 =
             new ShuffleVectorInst(I2, UndefValue::get(I2T),
                                   ConstantVector::get(Mask),
-                                  getReplacementName(I, true, o, 1));
-          NewI2->insertBefore(J);
+                                  getReplacementName(IBeforeJ ? I : J,
+                                                     true, o, 1));
+          NewI2->insertBefore(IBeforeJ ? J : I);
           I2 = NewI2;
           I2T = I1T;
           I2Elem = I1Elem;
@@ -2048,8 +2123,8 @@ namespace {
 
         Instruction *NewOp =
           new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
-                                getReplacementName(I, true, o));
-        NewOp->insertBefore(J);
+                                getReplacementName(IBeforeJ ? I : J, true, o));
+        NewOp->insertBefore(IBeforeJ ? J : I);
         return NewOp;
       }
     }
@@ -2057,17 +2132,17 @@ namespace {
     Type *ArgType = ArgTypeL;
     if (numElemL < numElemH) {
       if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
-                                         ArgTypeL, VArgType, 1)) {
+                                         ArgTypeL, VArgType, IBeforeJ, 1)) {
         // This is another short-circuit case: we're combining a scalar into
         // a vector that is formed by an IE chain. We've just expanded the IE
         // chain, now insert the scalar and we're done.
 
         Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
-                                               getReplacementName(I, true, o));
-        S->insertBefore(J);
+                           getReplacementName(IBeforeJ ? I : J, true, o));
+        S->insertBefore(IBeforeJ ? J : I);
         return S;
       } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
-                                ArgTypeH)) {
+                                ArgTypeH, IBeforeJ)) {
         // The two vector inputs to the shuffle must be the same length,
         // so extend the smaller vector to be the same length as the larger one.
         Instruction *NLOp;
@@ -2082,29 +2157,32 @@ namespace {
     
           NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
                                        ConstantVector::get(Mask),
-                                       getReplacementName(I, true, o, 1));
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
         } else {
           NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
-                                           getReplacementName(I, true, o, 1));
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
         }
   
-        NLOp->insertBefore(J);
+        NLOp->insertBefore(IBeforeJ ? J : I);
         LOp = NLOp;
       }
 
       ArgType = ArgTypeH;
     } else if (numElemL > numElemH) {
       if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
-                                         ArgTypeH, VArgType)) {
+                                         ArgTypeH, VArgType, IBeforeJ)) {
         Instruction *S =
           InsertElementInst::Create(LOp, HOp, 
                                     ConstantInt::get(Type::getInt32Ty(Context),
                                                      numElemL),
-                                    getReplacementName(I, true, o));
-        S->insertBefore(J);
+                                    getReplacementName(IBeforeJ ? I : J,
+                                                       true, o));
+        S->insertBefore(IBeforeJ ? J : I);
         return S;
       } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
-                                ArgTypeL)) {
+                                ArgTypeL, IBeforeJ)) {
         Instruction *NHOp;
         if (numElemH > 1) {
           std::vector<Constant *> Mask(numElemL);
@@ -2116,13 +2194,15 @@ namespace {
     
           NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
                                        ConstantVector::get(Mask),
-                                       getReplacementName(I, true, o, 1));
+                                       getReplacementName(IBeforeJ ? I : J,
+                                                          true, o, 1));
         } else {
           NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
-                                           getReplacementName(I, true, o, 1));
+                                           getReplacementName(IBeforeJ ? I : J,
+                                                              true, o, 1));
         }
   
-        NHOp->insertBefore(J);
+        NHOp->insertBefore(IBeforeJ ? J : I);
         HOp = NHOp;
       }
     }
@@ -2140,19 +2220,21 @@ namespace {
       }
 
       Instruction *BV = new ShuffleVectorInst(LOp, HOp,
-                                              ConstantVector::get(Mask),
-                                              getReplacementName(I, true, o));
-      BV->insertBefore(J);
+                          ConstantVector::get(Mask),
+                          getReplacementName(IBeforeJ ? I : J, true, o));
+      BV->insertBefore(IBeforeJ ? J : I);
       return BV;
     }
 
     Instruction *BV1 = InsertElementInst::Create(
                                           UndefValue::get(VArgType), LOp, CV0,
-                                          getReplacementName(I, true, o, 1));
-    BV1->insertBefore(I);
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 1));
+    BV1->insertBefore(IBeforeJ ? J : I);
     Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
-                                          getReplacementName(I, true, o, 2));
-    BV2->insertBefore(J);
+                                          getReplacementName(IBeforeJ ? I : J,
+                                                             true, o, 2));
+    BV2->insertBefore(IBeforeJ ? J : I);
     return BV2;
   }
 
@@ -2160,7 +2242,8 @@ namespace {
   // to the vector instruction that fuses I with J.
   void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
                      Instruction *I, Instruction *J,
-                     SmallVector<Value *, 3> &ReplacedOperands) {
+                     SmallVector<Value *, 3> &ReplacedOperands,
+                     bool IBeforeJ) {
     unsigned NumOperands = I->getNumOperands();
 
     for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
@@ -2197,7 +2280,7 @@ namespace {
         continue;
       }
 
-      ReplacedOperands[o] = getReplacementInput(Context, I, J, o);
+      ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
     }
   }
 
@@ -2392,18 +2475,20 @@ namespace {
   void BBVectorize::fuseChosenPairs(BasicBlock &BB,
                      std::vector<Value *> &PairableInsts,
                      DenseMap<Value *, Value *> &ChosenPairs,
-                     DenseSet<ValuePair> &FixedOrderPairs) {
+                     DenseSet<ValuePair> &FixedOrderPairs,
+                     DenseMap<VPPair, unsigned> &PairConnectionTypes,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+                     std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) {
     LLVMContext& Context = BB.getContext();
 
     // During the vectorization process, the order of the pairs to be fused
     // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
     // list. After a pair is fused, the flipped pair is removed from the list.
-    std::vector<ValuePair> FlippedPairs;
-    FlippedPairs.reserve(ChosenPairs.size());
+    DenseSet<ValuePair> FlippedPairs;
     for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(),
          E = ChosenPairs.end(); P != E; ++P)
-      FlippedPairs.push_back(ValuePair(P->second, P->first));
-    for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(),
+      FlippedPairs.insert(ValuePair(P->second, P->first));
+    for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(),
          E = FlippedPairs.end(); P != E; ++P)
       ChosenPairs.insert(*P);
 
@@ -2451,37 +2536,83 @@ namespace {
 
       // If the pair must have the other order, then flip it.
       bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
+      if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
+        // This pair does not have a fixed order, and so we might want to
+        // flip it if that will yield fewer shuffles. We count the number
+        // of dependencies connected via swaps, and those directly connected,
+        // and flip the order if the number of swaps is greater.
+        bool OrigOrder = true;
+        VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J));
+        if (IP.first == ConnectedPairDeps.end()) {
+          IP = ConnectedPairDeps.equal_range(ValuePair(J, I));
+          OrigOrder = false;
+        }
+
+        if (IP.first != ConnectedPairDeps.end()) {
+          unsigned NumDepsDirect = 0, NumDepsSwap = 0;
+          for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+               Q != IP.second; ++Q) {
+            DenseMap<VPPair, unsigned>::iterator R =
+              PairConnectionTypes.find(VPPair(Q->second, Q->first));
+            assert(R != PairConnectionTypes.end() &&
+                   "Cannot find pair connection type");
+            if (R->second == PairConnectionDirect)
+              ++NumDepsDirect;
+            else if (R->second == PairConnectionSwap)
+              ++NumDepsSwap;
+          }
+
+          if (!OrigOrder)
+            std::swap(NumDepsDirect, NumDepsSwap);
+
+          if (NumDepsSwap > NumDepsDirect) {
+            FlipPairOrder = true;
+            DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
+                            " <-> " << *J << "\n");
+          }
+        }
+      }
 
       Instruction *L = I, *H = J;
       if (FlipPairOrder)
         std::swap(H, L);
 
+      // If the pair being fused uses the opposite order from that in the pair
+      // connection map, then we need to flip the types.
+      VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L));
+      for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first;
+           Q != IP.second; ++Q) {
+        DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q);
+        assert(R != PairConnectionTypes.end() &&
+               "Cannot find pair connection type");
+        if (R->second == PairConnectionDirect)
+          R->second = PairConnectionSwap;
+        else if (R->second == PairConnectionSwap)
+          R->second = PairConnectionDirect;
+      }
+
+      bool LBeforeH = !FlipPairOrder;
       unsigned NumOperands = I->getNumOperands();
       SmallVector<Value *, 3> ReplacedOperands(NumOperands);
-      getReplacementInputsForPair(Context, L, H, ReplacedOperands);
+      getReplacementInputsForPair(Context, L, H, ReplacedOperands,
+                                  LBeforeH);
 
       // Make a copy of the original operation, change its type to the vector
       // type and replace its operands with the vector operands.
-      Instruction *K = I->clone();
-      if (I->hasName()) K->takeName(I);
+      Instruction *K = L->clone();
+      if (L->hasName())
+        K->takeName(L);
+      else if (H->hasName())
+        K->takeName(H);
 
       if (!isa<StoreInst>(K))
         K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
 
-      combineMetadata(K, J);
+      combineMetadata(K, H);
 
       for (unsigned o = 0; o < NumOperands; ++o)
         K->setOperand(o, ReplacedOperands[o]);
 
-      // If we've flipped the memory inputs, make sure that we take the correct
-      // alignment.
-      if (FlipPairOrder) {
-        if (isa<StoreInst>(K))
-          cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment());
-        else if (isa<LoadInst>(K))
-          cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment());
-      }
-
       K->insertAfter(J);
 
       // Instruction insertion point:
@@ -2497,10 +2628,10 @@ namespace {
       moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
 
       if (!isa<StoreInst>(I)) {
-        I->replaceAllUsesWith(K1);
-        J->replaceAllUsesWith(K2);
-        AA->replaceWithNewValue(I, K1);
-        AA->replaceWithNewValue(J, K2);
+        L->replaceAllUsesWith(K1);
+        H->replaceAllUsesWith(K2);
+        AA->replaceWithNewValue(L, K1);
+        AA->replaceWithNewValue(H, K2);
       }
 
       // Instructions that may read from memory may be in the load move set.
@@ -2533,6 +2664,9 @@ namespace {
       SE->forgetValue(J);
       I->eraseFromParent();
       J->eraseFromParent();
+
+      DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
+                                               BB << "\n");
     }
 
     DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
diff --git a/test/Transforms/BBVectorize/X86/loop1.ll b/test/Transforms/BBVectorize/X86/loop1.ll
index 9d5d9fb617..c1be62203b 100644
--- a/test/Transforms/BBVectorize/X86/loop1.ll
+++ b/test/Transforms/BBVectorize/X86/loop1.ll
@@ -42,8 +42,8 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: %mul = fmul double %0, %0
 ; CHECK: %mul3 = fmul double %0, %1
 ; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
 ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/X86/simple.ll b/test/Transforms/BBVectorize/X86/simple.ll
index 6450f821f5..d11c9b92f0 100644
--- a/test/Transforms/BBVectorize/X86/simple.ll
+++ b/test/Transforms/BBVectorize/X86/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/cycle.ll b/test/Transforms/BBVectorize/cycle.ll
index 32a91ceee0..e8e82ce024 100644
--- a/test/Transforms/BBVectorize/cycle.ll
+++ b/test/Transforms/BBVectorize/cycle.ll
@@ -107,6 +107,6 @@ done:
   ret void
 ; CHECK: @test1
 ; CHECK: go:
-; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
+; CHECK: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
 ; FIXME: When tree pruning is deterministic, include the entire output.
 }
diff --git a/test/Transforms/BBVectorize/loop1.ll b/test/Transforms/BBVectorize/loop1.ll
index bebc91ad91..c22ea5852a 100644
--- a/test/Transforms/BBVectorize/loop1.ll
+++ b/test/Transforms/BBVectorize/loop1.ll
@@ -42,8 +42,8 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: %mul = fmul double %0, %0
 ; CHECK: %mul3 = fmul double %0, %1
 ; CHECK: %add = fadd double %mul, %mul3
-; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %mul8 = fmul double %1, %1
+; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
 ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
diff --git a/test/Transforms/BBVectorize/search-limit.ll b/test/Transforms/BBVectorize/search-limit.ll
index d9945b5630..aeaf98865b 100644
--- a/test/Transforms/BBVectorize/search-limit.ll
+++ b/test/Transforms/BBVectorize/search-limit.ll
@@ -7,8 +7,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK-SL4: @test1
 ; CHECK-SL4-NOT: <2 x double>
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple-int.ll b/test/Transforms/BBVectorize/simple-int.ll
index 6844977143..ae1d63bfd8 100644
--- a/test/Transforms/BBVectorize/simple-int.ll
+++ b/test/Transforms/BBVectorize/simple-int.ll
@@ -17,8 +17,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1,
 	ret double %R
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
@@ -43,8 +43,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 	ret double %R
 ; CHECK: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
@@ -68,8 +68,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2, i32 %P) {
 	ret double %R
 ; CHECK: @test3
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
 ; CHECK: %Y1 = call <2 x double> @llvm.powi.v2f64(<2 x double> %X1, i32 %P)
diff --git a/test/Transforms/BBVectorize/simple-ldstr.ll b/test/Transforms/BBVectorize/simple-ldstr.ll
index 6883e844be..7dd77c933f 100644
--- a/test/Transforms/BBVectorize/simple-ldstr.ll
+++ b/test/Transforms/BBVectorize/simple-ldstr.ll
@@ -94,13 +94,13 @@ entry:
 ; CHECK-AO: @test3
 ; CHECK-AO: %i0 = load double* %a, align 8
 ; CHECK-AO: %i1 = load double* %b, align 8
-; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
-; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
 ; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
 ; CHECK-AO: %i3 = load double* %arrayidx3, align 8
 ; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
 ; CHECK-AO: %i4 = load double* %arrayidx4, align 8
+; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
 ; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
+; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
 ; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
 ; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
 ; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
diff --git a/test/Transforms/BBVectorize/simple-sel.ll b/test/Transforms/BBVectorize/simple-sel.ll
index 325792a5dc..15ecb59702 100644
--- a/test/Transforms/BBVectorize/simple-sel.ll
+++ b/test/Transforms/BBVectorize/simple-sel.ll
@@ -6,8 +6,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define double @test1(double %A1, double %A2, double %B1, double %B2, i1 %C1, i1 %C2) {
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -33,8 +33,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test2
 ; CHECK-NB: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
diff --git a/test/Transforms/BBVectorize/simple.ll b/test/Transforms/BBVectorize/simple.ll
index 7cd8133bee..d9a12eebed 100644
--- a/test/Transforms/BBVectorize/simple.ll
+++ b/test/Transforms/BBVectorize/simple.ll
@@ -5,8 +5,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 define double @test1(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test1
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -29,8 +29,8 @@ define double @test1(double %A1, double %A2, double %B1, double %B2) {
 define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test2
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -40,12 +40,13 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
 	%Z1 = fadd double %Y2, %B1
 	%Z2 = fadd double %Y1, %B2
-; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
-; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
+; CHECK: %Z1.v.i1.1 = insertelement <2 x double> undef, double %B2, i32 0
+; CHECK: %Z1.v.i1.2 = insertelement <2 x double> %Z1.v.i1.1, double %B1, i32 1
+; CHECK: %Z2 = fadd <2 x double> %Y1, %Z1.v.i1.2
 	%R  = fmul double %Z1, %Z2
-; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
-; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
-; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+; CHECK: %Z2.v.r1 = extractelement <2 x double> %Z2, i32 0
+; CHECK: %Z2.v.r2 = extractelement <2 x double> %Z2, i32 1
+; CHECK: %R = fmul double %Z2.v.r2, %Z2.v.r1
 	ret double %R
 ; CHECK: ret double %R
 }
@@ -54,8 +55,8 @@ define double @test2(double %A1, double %A2, double %B1, double %B2) {
 define double @test3(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test3
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -79,8 +80,8 @@ define double @test3(double %A1, double %A2, double %B1, double %B2) {
 define double @test4(double %A1, double %A2, double %B1, double %B2) {
 ; CHECK: @test4
 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
-; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
 	%X1 = fsub double %A1, %B1
 	%X2 = fsub double %A2, %B2
@@ -148,3 +149,27 @@ define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
 ; CHECK: ret <8 x i8> %R
 }
 
+; Basic depth-3 chain (flipped order)
+define double @test7(double %A1, double %A2, double %B1, double %B2) {
+; CHECK: @test7
+; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
+; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
+; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
+; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
+	%X1 = fsub double %A1, %B1
+	%X2 = fsub double %A2, %B2
+; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
+	%Y1 = fmul double %X1, %A1
+	%Y2 = fmul double %X2, %A2
+; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
+	%Z2 = fadd double %Y2, %B2
+	%Z1 = fadd double %Y1, %B1
+; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
+	%R  = fmul double %Z1, %Z2
+; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
+; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
+; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
+	ret double %R
+; CHECK: ret double %R
+}
+
author	Hal Finkel <hfinkel@anl.gov>	2012-10-31 15:17:07 +0000
committer	Hal Finkel <hfinkel@anl.gov>	2012-10-31 15:17:07 +0000
commit	72465ea23d010507d3746adc126d719005981e05 (patch)
tree	d5e6b1ad3aad528df1c41d88c82db6a62ba61ca4
parent	ef026f1b5e4d52e11c67a1a5ad01eadffcfa4d8e (diff)
download	llvm-72465ea23d010507d3746adc126d719005981e05.tar.gz llvm-72465ea23d010507d3746adc126d719005981e05.tar.bz2 llvm-72465ea23d010507d3746adc126d719005981e05.tar.xz