From 2652c50f74bc4a874c6a2e4b34ff2d52d479183f Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 24 Oct 2012 23:47:38 +0000
Subject: Implement a basic cost model for vector and scalar instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166642 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h        |   2 +
 include/llvm/Target/TargetTransformImpl.h   |   5 ++
 lib/Target/TargetTransformImpl.cpp          | 129 +++++++++++++++++++++++++++-
 lib/Transforms/IPO/PassManagerBuilder.cpp   |   2 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp  |  49 +++++++----
 test/Transforms/LoopVectorize/cost-model.ll |   4 +-
 6 files changed, 168 insertions(+), 23 deletions(-)
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 0dbdd25140..19eb941635 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1958,6 +1958,7 @@ private:
 
   ValueTypeActionImpl ValueTypeActions;
 
+public:
   LegalizeKind
   getTypeConversion(LLVMContext &Context, EVT VT) const {
     // If this is a simple type, use the ComputeRegisterProp mechanism.
@@ -2074,6 +2075,7 @@ private:
     return LegalizeKind(TypeSplitVector, NVT);
   }
 
+private:
   std::vector<std::pair<EVT, const TargetRegisterClass*> > AvailableRegClasses;
 
   /// TargetDAGCombineArray - Targets can specify ISD nodes that they would
diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index ec39f9968e..25a7edeb01 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -16,6 +16,7 @@
 #define LLVM_TARGET_TARGET_TRANSFORMATION_IMPL_H
 
 #include "llvm/TargetTransformInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
 
 namespace llvm {
 
@@ -51,6 +52,10 @@ class VectorTargetTransformImpl : public VectorTargetTransformInfo {
 private:
   const TargetLowering *TLI;
 
+  /// Estimate the cost of type-legalization and the legalized type.
+  std::pair<unsigned, EVT>
+  getTypeLegalizationCost(LLVMContext &C, EVT Ty) const;
+
 public:
   explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
   
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index cee736bd71..382eecb766 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/Target/TargetTransformImpl.h"
 #include "llvm/Target/TargetLowering.h"
+#include <utility>
 
 using namespace llvm;
 
@@ -53,11 +54,131 @@ unsigned ScalarTargetTransformImpl::getJumpBufSize() const {
 // Calls used by the vectorizers.
 //
 //===----------------------------------------------------------------------===//
+int InstructionOpcodeToISD(unsigned Opcode) {
+  static const int OpToISDTbl[] = {
+    /*Instruction::Ret           */ 0, // Opcode numbering start at #1.
+    /*Instruction::Br            */ 0,
+    /*Instruction::Switch        */ 0,
+    /*Instruction::IndirectBr    */ 0,
+    /*Instruction::Invoke        */ 0,
+    /*Instruction::Resume        */ 0,
+    /*Instruction::Unreachable   */ 0,
+    /*Instruction::Add           */ ISD::ADD,
+    /*Instruction::FAdd          */ ISD::FADD,
+    /*Instruction::Sub           */ ISD::SUB,
+    /*Instruction::FSub          */ ISD::FSUB,
+    /*Instruction::Mul           */ ISD::MUL,
+    /*Instruction::FMul          */ ISD::FMUL,
+    /*Instruction::UDiv          */ ISD::UDIV,
+    /*Instruction::SDiv          */ ISD::UDIV,
+    /*Instruction::FDiv          */ ISD::FDIV,
+    /*Instruction::URem          */ ISD::UREM,
+    /*Instruction::SRem          */ ISD::SREM,
+    /*Instruction::FRem          */ ISD::FREM,
+    /*Instruction::Shl           */ ISD::SHL,
+    /*Instruction::LShr          */ ISD::SRL,
+    /*Instruction::AShr          */ ISD::SRA,
+    /*Instruction::And           */ ISD::AND,
+    /*Instruction::Or            */ ISD::OR,
+    /*Instruction::Xor           */ ISD::XOR,
+    /*Instruction::Alloca        */ 0,
+    /*Instruction::Load          */ ISD::LOAD,
+    /*Instruction::Store         */ ISD::STORE,
+    /*Instruction::GetElementPtr */ 0,
+    /*Instruction::Fence         */ 0,
+    /*Instruction::AtomicCmpXchg */ 0,
+    /*Instruction::AtomicRMW     */ 0,
+    /*Instruction::Trunc         */ ISD::TRUNCATE,
+    /*Instruction::ZExt          */ ISD::ZERO_EXTEND,
+    /*Instruction::SExt          */ ISD::SEXTLOAD,
+    /*Instruction::FPToUI        */ ISD::FP_TO_UINT,
+    /*Instruction::FPToSI        */ ISD::FP_TO_SINT,
+    /*Instruction::UIToFP        */ ISD::UINT_TO_FP,
+    /*Instruction::SIToFP        */ ISD::SINT_TO_FP,
+    /*Instruction::FPTrunc       */ ISD::FP_ROUND,
+    /*Instruction::FPExt         */ ISD::FP_EXTEND,
+    /*Instruction::PtrToInt      */ ISD::BITCAST,
+    /*Instruction::IntToPtr      */ ISD::BITCAST,
+    /*Instruction::BitCast       */ ISD::BITCAST,
+    /*Instruction::ICmp          */ ISD::SETCC,
+    /*Instruction::FCmp          */ ISD::SETCC,
+    /*Instruction::PHI           */ 0,
+    /*Instruction::Call          */ 0,
+    /*Instruction::Select        */ ISD::SELECT,
+    /*Instruction::UserOp1       */ 0,
+    /*Instruction::UserOp2       */ 0,
+    /*Instruction::VAArg         */ 0,
+    /*Instruction::ExtractElement*/ ISD::EXTRACT_VECTOR_ELT,
+    /*Instruction::InsertElement */ ISD::INSERT_VECTOR_ELT,
+    /*Instruction::ShuffleVector */ ISD::VECTOR_SHUFFLE,
+    /*Instruction::ExtractValue  */ ISD::MERGE_VALUES,
+    /*Instruction::InsertValue   */ ISD::MERGE_VALUES,
+    /*Instruction::LandingPad    */ 0};
+
+  assert((Instruction::Ret == 1) && (Instruction::LandingPad == 58) &&
+         "Instruction order had changed");
+
+  // Opcode numbering starts at #1 but the table starts at #0, so we subtract
+  // one from the opcode number.
+  return OpToISDTbl[Opcode - 1];
+}
+
+std::pair<unsigned, EVT> 
+VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
+                                                         EVT Ty) const {
+  unsigned Cost = 1;
+  // We keep legalizing the type until we find a legal kind. We assume that
+  // the only operation that costs anything is the split. After splitting
+  // we need to handle two types.
+  while (true) {
+    TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty);
+
+    if (LK.first == TargetLowering::TypeLegal)
+      return std::make_pair(Cost, LK.second);
+
+    if (LK.first == TargetLowering::TypeSplitVector)
+      Cost *= 2;
+
+    // Keep legalizing the type.
+    Ty = LK.second;
+  }
+}
 
 unsigned
 VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1,
                                         Type *Ty2) const {
-  return 1;
+  // Check if any of the operands are vector operands.
+  int ISD = InstructionOpcodeToISD(Opcode);
+
+  // Selects on vectors are actually vector selects.
+  if (ISD == ISD::SELECT) {
+    assert(Ty2 && "Ty2 must hold the select type");
+    if (Ty2->isVectorTy())
+    ISD = ISD::VSELECT;
+  }
+
+  // If we don't have any information about this instruction assume it costs 1.
+  if (ISD == 0)
+    return 1;
+
+  assert(Ty1 && "We need to have at least one type");
+
+  // From this stage we look at the legalized type.
+  std::pair<unsigned, EVT>  LT =
+  getTypeLegalizationCost(Ty1->getContext(), TLI->getValueType(Ty1));
+
+  if (TLI->isOperationLegalOrCustom(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1. Multiply
+    // by the type-legalization overhead.
+    return LT.first * 1;
+  }
+
+  unsigned NumElem =
+  (LT.second.isVector() ? LT.second.getVectorNumElements() : 1);
+
+  // We will probably scalarize this instruction. Assume that the cost is the
+  // number of the vector elements.
+  return LT.first * NumElem * 1;
 }
 
 unsigned
@@ -69,5 +190,9 @@ unsigned
 VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            unsigned Alignment,
                                            unsigned AddressSpace) const {
-  return 1;
+  // From this stage we look at the legalized type.
+  std::pair<unsigned, EVT>  LT =
+  getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
+  // Assume that all loads of legal types cost 1.
+  return LT.first;
 }
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 1d8f1e531a..1b196eab84 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -185,7 +185,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
 
-  if (Vectorize) {
+  if (Vectorize || true) {
     MPM.add(createLoopVectorizePass());
     MPM.add(createLICMPass());
   }
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 35f49e4f30..483b9fcbc3 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -300,10 +300,10 @@ private:
 class LoopVectorizationCostModel {
 public:
   /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
                              LoopVectorizationLegality *Leg,
                              const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), DL(Dl), Legal(Leg), VTTI(Vtti) { }
+  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
 
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
@@ -325,8 +325,7 @@ private:
   Loop *TheLoop;
   /// Scev analysis.
   ScalarEvolution *SE;
-  /// DataLayout analysis.
-  DataLayout *DL;
+
   /// Vectorization legality.
   LoopVectorizationLegality *Legal;
   /// Vector target information.
@@ -372,7 +371,7 @@ struct LoopVectorize : public LoopPass {
       if (TTI)
         VTTI = TTI->getVectorTargetTransformInfo();
       // Use the cost model.
-      LoopVectorizationCostModel CM(L, SE, DL, &LVL, VTTI);
+      LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
       VF = CM.findBestVectorizationFactor();
 
       if (VF == 1) {
@@ -1432,11 +1431,12 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
     Instruction *Inst = it;
-    Cost += getInstructionCost(Inst, VF);
+    unsigned C = getInstructionCost(Inst, VF);
+    Cost += C;
+    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
+          " For instruction: "<< *Inst << "\n");
   }
 
-  // Return the cost divided by VF, because we will be executing
-  // less iterations of the vector form.
   return Cost;
 }
 
@@ -1444,11 +1444,13 @@ unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
   switch (I->getOpcode()) {
+    case Instruction::GetElementPtr:
+      return 0;
     case Instruction::Br: {
       return VTTI->getInstrCost(I->getOpcode());
     }
     case Instruction::PHI:
-      // PHIs are handled the same as the binary instructions below.
+      return 0;
     case Instruction::Add:
     case Instruction::FAdd:
     case Instruction::Sub:
@@ -1493,11 +1495,17 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // Scalarized stores.
       if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
         unsigned Cost = 0;
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
-        // The cost of extracting from the vector value.
-        Cost += VF * ExtCost;
+        if (VF != 1) {
+          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                                VTy);
+          // The cost of extracting from the value vector and pointer vector.
+          Cost += VF * (ExtCost * 2);
+        }
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           VTy->getScalarType(),
+                                           SI->getAlignment(),
+                                           SI->getPointerAddressSpace());
         return Cost;
       }
 
@@ -1512,11 +1520,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // Scalarized loads.
       if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
         unsigned Cost = 0;
-        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-        // The cost of inserting the loaded value into the result vector.
-        Cost += VF * InCost;
+        if (VF != 1) {
+          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+          unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
+
+          // The cost of inserting the loaded value into the result vector, and
+          // extracting from a vector of pointers.
+          Cost += VF * (InCost + ExCost);
+        }
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getInstrCost(I->getOpcode(), VTy->getScalarType());
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+                                           LI->getAlignment(),
+                                           LI->getPointerAddressSpace());
         return Cost;
       }
 
diff --git a/test/Transforms/LoopVectorize/cost-model.ll b/test/Transforms/LoopVectorize/cost-model.ll
index 20c710db46..18abf2885e 100644
--- a/test/Transforms/LoopVectorize/cost-model.ll
+++ b/test/Transforms/LoopVectorize/cost-model.ll
@@ -8,10 +8,8 @@ target triple = "x86_64-apple-macosx10.8.0"
 @d = common global [2048 x i32] zeroinitializer, align 16
 @a = common global [2048 x i32] zeroinitializer, align 16
 
-; At this point the cost model is pretty bad and we are vectorizing the code below.
-; TODO: This code should not be vectorized on x86.
 ;CHECK: cost_model_1
-;CHECK: <4 x i32>
+;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @cost_model_1() nounwind uwtable noinline ssp {
 entry:
-- 
cgit v1.2.3