author     Quentin Colombet <qcolombet@apple.com>    2013-09-17 16:57:34 +0000
committer  Quentin Colombet <qcolombet@apple.com>    2013-09-17 16:57:34 +0000
commit     0119f3df9c2016664540f8e3be89fe5cd54cbb07 (patch)
tree       dbcfd1cafb1d905de869cd63e54b3904d9a36904 /lib
parent     2ff37c701ebb4e4d593df085fe8bb60f2d6d340a (diff)
[InstCombiner] Slice a big load in two loads when the elements are next to each
other in memory.

The motivation was to get rid of the truncate and shift-right instructions that
get in the way of paired loads or floating point loads.

E.g., consider the following example:

struct Complex {
  float real;
  float imm;
};

When accessing a complex, llvm was generating a 64-bit load, and the imm field
was obtained by a trunc(lshr) sequence, resulting in poor code generation, at
least for x86.

The idea is to declare that two load instructions are the canonical form for
loading two arithmetic types that are next to each other in memory. Two scalar
loads at a constant offset from each other are pretty easy to detect for the
sorts of passes that like to mess with loads.

<rdar://problem/14477220>

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@190870 91177308-0d34-0410-b5e6-96231b3b80d8
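As an illustration of the pattern being canonicalized (a hand-written sketch,
not taken from the commit or its tests; the function @sum, the value names, the
alignment and the little-endian layout are assumptions), the IR before slicing
looks roughly like this in the typed-pointer syntax of the time:

    %struct.Complex = type { float, float }

    define float @sum(%struct.Complex* %p) {
    entry:
      ; A single 64-bit load covers both fields.
      %c = bitcast %struct.Complex* %p to i64*
      %v = load i64* %c, align 4
      ; real: low 32 bits, extracted with a plain trunc.
      %lo = trunc i64 %v to i32
      %real = bitcast i32 %lo to float
      ; imm: high 32 bits, extracted with the trunc(lshr) sequence.
      %sh = lshr i64 %v, 32
      %hi = trunc i64 %sh to i32
      %imm = bitcast i32 %hi to float
      %sum = fadd float %real, %imm
      ret float %sum
    }

With the slicing implemented in the patch below, each trunc is conceptually
rewritten as its own 32-bit load (raw_cast/raw_idx/cast and the ".val" suffix
are the value names used by loadSlice; the exact output may differ):

      ; Slice for the real part, at offset 0.
      %raw_cast = bitcast %struct.Complex* %p to i8*
      %cast = bitcast i8* %raw_cast to i32*
      %lo.val = load i32* %cast, align 4
      %real = bitcast i32 %lo.val to float
      ; Slice for the imm part, at offset 4 on a little-endian target.
      %raw_cast1 = bitcast %struct.Complex* %p to i8*
      %raw_idx = getelementptr inbounds i8* %raw_cast1, i64 4
      %cast1 = bitcast i8* %raw_idx to i32*
      %hi.val = load i32* %cast1, align 4
      %imm = bitcast i32 %hi.val to float
      %sum = fadd float %real, %imm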
Diffstat (limited to 'lib')
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  285
1 file changed, 285 insertions, 0 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 88e16e9725..0579c27db8 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -16,10 +16,20 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
+/// Hidden option to stress test load slicing, i.e., when this option
+/// is enabled, load slicing bypasses most of its profitability guards.
+/// It will also generate non-canonical forms of slicing.
+static cl::opt<bool>
+StressLoadSlicing("instcombine-stress-load-slicing", cl::Hidden,
+ cl::desc("Bypass the profitability model of load "
+ "slicing"),
+ cl::init(false));
+
STATISTIC(NumDeadStore, "Number of dead stores eliminated");
STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
@@ -337,6 +347,274 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI,
return 0;
}
+namespace {
+ /// \brief Helper structure used to slice a load into smaller loads.
+ struct LoadedSlice {
+ // The last instruction that represents the slice. This should be a
+ // truncate instruction.
+ Instruction *Inst;
+ // The original load instruction.
+ LoadInst *Origin;
+ // The right shift amount in bits from the original load.
+ unsigned Shift;
+
+ LoadedSlice(Instruction *Inst = NULL, LoadInst *Origin = NULL,
+ unsigned Shift = 0)
+ : Inst(Inst), Origin(Origin), Shift(Shift) {}
+
+ LoadedSlice(const LoadedSlice& LS) : Inst(LS.Inst), Origin(LS.Origin),
+ Shift(LS.Shift) {}
+
+ /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+ /// \return Result is \p BitWidth bits wide, with used bits set to 1 and
+ /// unused bits set to 0.
+ APInt getUsedBits() const {
+ // Reproduce the trunc(lshr) sequence:
+ // - Start from the truncated value.
+ // - Zero extend to the desired bit width.
+ // - Shift left.
+ assert(Origin && "No original load to compare against.");
+ unsigned BitWidth = Origin->getType()->getPrimitiveSizeInBits();
+ assert(Inst && "This slice is not bound to an instruction");
+ assert(Inst->getType()->getPrimitiveSizeInBits() <= BitWidth &&
+ "Extracted slice is smaller than the whole type!");
+ APInt UsedBits(Inst->getType()->getPrimitiveSizeInBits(), 0);
+ UsedBits.setAllBits();
+ UsedBits = UsedBits.zext(BitWidth);
+ UsedBits <<= Shift;
+ return UsedBits;
+ }
+
+ /// \brief Get the size of the slice to be loaded in bytes.
+ unsigned getLoadedSize() const {
+ unsigned SliceSize = getUsedBits().countPopulation();
+ assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+ return SliceSize / 8;
+ }
+
+ /// \brief Get the offset in bytes of this slice in the original chunk of
+ /// bits, whose layout is defined by \p IsBigEndian.
+ uint64_t getOffsetFromBase(bool IsBigEndian) const {
+ assert(!(Shift & 0x7) && "Shifts not aligned on bytes are not supported.");
+ uint64_t Offset = Shift / 8;
+ unsigned TySizeInBytes = Origin->getType()->getPrimitiveSizeInBits() / 8;
+ assert(!(Origin->getType()->getPrimitiveSizeInBits() & 0x7) &&
+ "The size of the original loaded type is not a multiple of a"
+ " byte.");
+ // If Offset is bigger than TySizeInBytes, it means we are loading all
+ // zeros. This should have been optimized before in the process.
+ assert(TySizeInBytes > Offset &&
+ "Invalid shift amount for given loaded size");
+ if (IsBigEndian)
+ Offset = TySizeInBytes - Offset - getLoadedSize();
+ return Offset;
+ }
+
+ /// \brief Generate the sequence of instructions to load the slice
+ /// represented by this object and redirect the uses of this slice to
+ /// this new sequence of instructions.
+ /// \pre this->Inst && this->Origin are valid Instructions.
+ /// \return The last instruction of the sequence used to load the slice.
+ Instruction *loadSlice(InstCombiner::BuilderTy &Builder,
+ bool IsBigEndian) const {
+ assert(Inst && Origin && "Unable to replace a non-existing slice.");
+ Value *BaseAddr = Origin->getOperand(0);
+ unsigned Alignment = Origin->getAlignment();
+ Builder.SetInsertPoint(Origin);
+ // Assume we are looking at a chunk of bytes.
+ // BaseAddr = (i8*)BaseAddr.
+ BaseAddr = Builder.CreateBitCast(BaseAddr, Builder.getInt8PtrTy(),
+ "raw_cast");
+ // Get the offset in that chunk of bytes w.r.t. the endianness.
+ uint64_t Offset = getOffsetFromBase(IsBigEndian);
+ if (Offset) {
+ APInt APOffset(64, Offset);
+ // BaseAddr = BaseAddr + Offset.
+ BaseAddr = Builder.CreateInBoundsGEP(BaseAddr, Builder.getInt(APOffset),
+ "raw_idx");
+ }
+
+ // Create the type of the loaded slice according to its size.
+ Type *SliceType =
+ Type::getIntNTy(Origin->getContext(), getLoadedSize() * 8);
+
+ // Bit cast the raw pointer to the pointer type of the slice.
+ BaseAddr = Builder.CreateBitCast(BaseAddr, SliceType->getPointerTo(),
+ "cast");
+
+ // Compute the new alignment.
+ if (Offset != 0)
+ Alignment = MinAlign(Alignment, Alignment + Offset);
+
+ // Create the load for the slice.
+ Instruction *LastInst = Builder.CreateAlignedLoad(BaseAddr, Alignment,
+ Inst->getName()+".val");
+ // If the final type is not the same as the loaded type, this means that
+ // we have to pad with zero. Create a zero extend for that.
+ Type * FinalType = Inst->getType();
+ if (SliceType != FinalType)
+ LastInst = cast<Instruction>(Builder.CreateZExt(LastInst, FinalType));
+
+ // Update the IR to reflect the new access to the slice.
+ Inst->replaceAllUsesWith(LastInst);
+
+ return LastInst;
+ }
+
+ /// \brief Check if it would be profitable to expand this slice as an
+ /// independent load.
+ bool isProfitable() const {
+ // Slicing is assumed to be profitable iff the chain leads to arithmetic
+ // operations.
+ SmallVector<const Instruction *, 8> Uses;
+ Uses.push_back(Inst);
+ do {
+ const Instruction *Use = Uses.pop_back_val();
+ for (Value::const_use_iterator UseIt = Use->use_begin(),
+ UseItEnd = Use->use_end(); UseIt != UseItEnd; ++UseIt) {
+ const Instruction *UseOfUse = cast<Instruction>(*UseIt);
+ // Consider these instructions as arithmetic operations.
+ if (isa<BinaryOperator>(UseOfUse) ||
+ isa<CastInst>(UseOfUse) ||
+ isa<PHINode>(UseOfUse) ||
+ isa<GetElementPtrInst>(UseOfUse))
+ return true;
+ // No need to check whether Use has already been visited: we never push
+ // any PHINode on the worklist, so the traversal cannot cycle.
+ Uses.push_back(UseOfUse);
+ }
+ } while (!Uses.empty());
+ DEBUG(dbgs() << "IC: Not a profitable slice " << *Inst << '\n');
+ return false;
+ }
+ };
+}
+
+/// \brief Check the profitability of all the involved LoadedSlices.
+/// Unless StressLoadSlicing is specified, this also returns false
+/// when slicing is not in the canonical form.
+/// The canonical form of a sliced load is (1) two loads,
+/// which are (2) next to each other in memory.
+///
+/// FIXME: We may want to allow more slices to be created but
+/// this means other passes should know how to deal with all those
+/// slices.
+/// FIXME: We may want to split loads to different types, e.g.,
+/// int vs. float.
+static bool
+isSlicingProfitable(const SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ const APInt &UsedBits) {
+ unsigned NbOfSlices = LoadedSlices.size();
+ // Check (1).
+ if (!StressLoadSlicing && NbOfSlices != 2)
+ return false;
+
+ // Check (2).
+ if (!StressLoadSlicing && !UsedBits.isAllOnesValue()) {
+ // Get rid of the unused bits on the right.
+ APInt MemoryLayout = UsedBits.lshr(UsedBits.countTrailingZeros());
+ // Get rid of the unused bits on the left.
+ if (MemoryLayout.countLeadingZeros())
+ MemoryLayout = MemoryLayout.trunc(MemoryLayout.getActiveBits());
+ // Check that the chunk of memory is completely used.
+ if (!MemoryLayout.isAllOnesValue())
+ return false;
+ }
+
+ unsigned NbOfProfitableSlices = 0;
+ for (unsigned CurrSlice = 0; CurrSlice < NbOfSlices; ++CurrSlice) {
+ if (LoadedSlices[CurrSlice].isProfitable())
+ ++NbOfProfitableSlices;
+ else if (!StressLoadSlicing)
+ return false;
+ }
+ // In Stress mode, we may have 0 profitable slices.
+ // Check that here.
+ // In non-Stress mode, all the slices are profitable at this point.
+ return NbOfProfitableSlices > 0;
+}
+
+/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr)
+/// operations, split it into the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre LI is a simple load (i.e., not an atomic or volatile load).
+static Instruction *sliceUpLoadInst(LoadInst &LI,
+ InstCombiner::BuilderTy &Builder,
+ DataLayout &TD) {
+ assert(LI.isSimple() && "We are trying to transform a non-simple load!");
+
+ // FIXME: If we want to support floating point and vector types, we should
+ // support bitcast and extract/insert element instructions.
+ Type *LITy = LI.getType();
+ if (!LITy->isIntegerTy()) return 0;
+
+ // Keep track of already used bits to detect overlapping values.
+ // In that case, we will just abort the transformation.
+ APInt UsedBits(LITy->getPrimitiveSizeInBits(), 0);
+
+ SmallVector<LoadedSlice, 4> LoadedSlices;
+
+ // Check if this load is used as several smaller chunks of bits.
+ // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+ // of computation for each trunc.
+ for (Value::use_iterator UI = LI.use_begin(), UIEnd = LI.use_end();
+ UI != UIEnd; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ unsigned Shift = 0;
+
+ // Check if this is a trunc(lshr).
+ if (User->getOpcode() == Instruction::LShr && User->hasOneUse() &&
+ isa<ConstantInt>(User->getOperand(1))) {
+ Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue();
+ User = User->use_back();
+ }
+
+ // At this point, User is a TruncInst iff we encountered a trunc or
+ // trunc(lshr) sequence.
+ if (!isa<TruncInst>(User))
+ return 0;
+
+ // The width of the type must be a power of 2 and at least 8 bits;
+ // otherwise the sliced load cannot be represented in LLVM IR.
+ // Moreover, if the shift amount is not a multiple of 8 bits, the slice
+ // is not byte aligned in memory. We do not support that.
+ unsigned Width = User->getType()->getPrimitiveSizeInBits();
+ if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+ return 0;
+
+ // Build the slice for this chain of computations.
+ LoadedSlice LS(User, &LI, Shift);
+ APInt CurrentUsedBits = LS.getUsedBits();
+
+ // Check if this slice overlaps with another.
+ if ((CurrentUsedBits & UsedBits) != 0)
+ return 0;
+ // Update the bits used globally.
+ UsedBits |= CurrentUsedBits;
+
+ // Record the slice.
+ LoadedSlices.push_back(LS);
+ }
+
+ // Abort slicing if it does not seem to be profitable.
+ if (!isSlicingProfitable(LoadedSlices, UsedBits))
+ return 0;
+
+ // Rewrite each chain to use an independent load.
+ // By construction, each chain can be represented by a unique load.
+ bool IsBigEndian = TD.isBigEndian();
+ for (SmallVectorImpl<LoadedSlice>::const_iterator LSIt = LoadedSlices.begin(),
+ LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) {
+ Instruction *SliceInst = LSIt->loadSlice(Builder, IsBigEndian);
+ (void)SliceInst;
+ DEBUG(dbgs() << "IC: Replacing " << *LSIt->Inst << "\n"
+ " with " << *SliceInst << '\n');
+ }
+ return 0; // Don't do anything with LI.
+}
+
Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
Value *Op = LI.getOperand(0);
@@ -443,6 +721,13 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
}
}
}
+
+ // Try to split a load into smaller non-overlapping loads to expose independent
+ // chains of computation and get rid of trunc/lshr sequences.
+ // The data layout is required for that operation, as code generation
+ // changes with respect to endianness.
+ if (TD)
+ return sliceUpLoadInst(LI, *Builder, *TD);
return 0;
}
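For contrast, here is a sketch (again hypothetical, reusing the %struct.Complex
layout from the example above) of a load that the default profitability check
rejects: only one trunc(lshr) chain exists, so isSlicingProfitable sees a single
slice, check (1) fails, and the load is left alone unless the hidden
-instcombine-stress-load-slicing option is passed to opt (e.g., something like
"opt -instcombine -instcombine-stress-load-slicing -S in.ll"):

    %struct.Complex = type { float, float }

    define float @only_imm(%struct.Complex* %p) {
    entry:
      %c = bitcast %struct.Complex* %p to i64*
      %v = load i64* %c, align 4
      ; Only the high half of the load is used: UsedBits is
      ; 0xFFFFFFFF00000000 and a single LoadedSlice is recorded.
      %sh = lshr i64 %v, 32
      %hi = trunc i64 %sh to i32
      %imm = bitcast i32 %hi to float
      ret float %imm
    }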