Implement X86TTI::getUnrollingPreferences

This provides an initial implementation of getUnrollingPreferences for x86. getUnrollingPreferences is used by the generic (concatenation) unroller, which is distinct from the unrolling done by the loop vectorizer.

Many modern x86 cores have some kind of uop cache and loop-stream detector (LSD) used to efficiently dispatch small loops, and taking full advantage of this requires unrolling small loops (small here means tens of uops). These caches also have limits on the number of taken branches in the loop, and so we also cap the loop unrolling factor based on the maximum "depth" of the loop. This is currently calculated with a partial DFS traversal (partial because it will stop early if the path length grows too much). This is still an approximation, and one that is both conservative (because it does not account for branches eliminated via block placement) and optimistic (because it is only recording the maximum depth over minimum paths). Nevertheless, because the loops that fit in these uop caches are so small, it is not clear how much the details matter.

The original set of patches posted for review produced the following test-suite performance results (from the TSVC benchmark) at that time:
  ControlLoops-dbl - 13% speedup
  ControlLoops-flt - 15% speedup
  Reductions-dbl - 7.5% speedup

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205348 91177308-0d34-0410-b5e6-96231b3b80d8
author: Hal Finkel <hfinkel@anl.gov> 2014-04-01 18:50:34 +0000
committer: Hal Finkel <hfinkel@anl.gov> 2014-04-01 18:50:34 +0000
commit: e30aa957e33a8e75a79a5ffb5faa1e1397731273 (patch)
tree: ff0b70af194f1aef77c0f7b259f4f8a6618b5157
parent: 0d5c0629bbd49977ed53a093fd96ed3fd2c234f5 (diff)
download: llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.gz
llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.bz2
llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.xz
4 files changed, 197 insertions, 10 deletions
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index ed04cdc4e4..437f63d328 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -17,8 +17,11 @@
 #define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
@@ -31,6 +34,17 @@ namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
+static cl::opt<bool>
+UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
+  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
+  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
+static cl::opt<unsigned>
+PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
+  cl::desc("Threshold for taken branches in X86 partial unrolling"),
+  cl::Hidden);
+
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -73,6 +87,8 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -137,6 +153,93 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
+void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
+  if (!UsePartialUnrolling)
+    return;
+  // According to the Intel 64 and IA-32 Architectures Optimization Reference
+  // Manual, Intel Core models and later have a loop stream detector
+  // (and associated uop queue) that can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
+  //    taken, and none of them may be calls.
+  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
+
+  // According to the Software Optimization Guide for AMD Family 15h Processors,
+  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
+  // buffer which can benefit from partial unrolling.
+  // The relevant requirements are:
+  //  - The loop must have fewer than 16 branches
+  //  - The loop must have less than 40 uops in all executed loop branches
+
+  unsigned MaxBranches, MaxOps;
+  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
+    MaxBranches = PartialUnrollingMaxBranches;
+    MaxOps = PartialUnrollingThreshold;
+  } else if (ST->isAtom()) {
+    // On the Atom, the throughput for taken branches is 2 cycles. For small
+    // simple loops, expand by a small factor to hide the backedge cost.
+    MaxBranches = 2;
+    MaxOps = 10;
+  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
+    MaxBranches = 16;
+    MaxOps = 40;
+  } else if (ST->hasFMA4() /* Any other recent AMD */) {
+    return;
+  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
+    MaxBranches = 8;
+    MaxOps = 28;
+  } else if (ST->hasSSSE3() /* Intel Core */) {
+    MaxBranches = 4;
+    MaxOps = 18;
+  } else {
+    return;
+  }
+
+  // Scan the loop: don't unroll loops with calls, and count the potential
+  // number of taken branches (this is somewhat conservative because we're
+  // counting all block transitions as potential branches while in reality some
+  // of these will become implicit via block placement).
+  unsigned MaxDepth = 0;
+  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
+       DE = df_end(L->getHeader()); DI != DE;) {
+    if (!L->contains(*DI)) {
+      DI.skipChildren();
+      continue;
+    }
+
+    MaxDepth = std::max(MaxDepth, DI.getPathLength());
+    if (MaxDepth > MaxBranches)
+      return;
+
+    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
+      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+        ImmutableCallSite CS(I);
+        if (const Function *F = CS.getCalledFunction()) {
+          if (!isLoweredToCall(F))
+            continue;
+        }
+
+        return;
+      }
+
+    ++DI;
+  }
+
+  // Enable runtime and partial unrolling up to the specified size.
+  UP.Partial = UP.Runtime = true;
+  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
+
+  // Set the maximum count based on the loop depth. The maximum number of
+  // branches taken in a loop (including the backedge) is equal to the maximum
+  // loop depth (the DFS path length from the loop header to any block in the
+  // loop). When the loop is unrolled, this depth (except for the backedge
+  // itself) is multiplied by the unrolling factor. This new unrolled depth
+  // must be less than the target-specific maximum branch count (which limits
+  // the number of taken branches in the uop buffer).
+  if (MaxDepth > 1)
+    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
+}
+
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
diff --git a/test/Transforms/LoopUnroll/X86/lit.local.cfg b/test/Transforms/LoopUnroll/X86/lit.local.cfg
new file mode 100644
index 0000000000..ba763cf03f
--- /dev/null
+++ b/test/Transforms/LoopUnroll/X86/lit.local.cfg
@@ -0,0 +1,4 @@
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+    config.unsupported = True
+
diff --git a/test/Transforms/LoopUnroll/X86/partial.ll b/test/Transforms/LoopUnroll/X86/partial.ll
new file mode 100644
index 0000000000..15867cbea0
--- /dev/null
+++ b/test/Transforms/LoopUnroll/X86/partial.ll
@@ -0,0 +1,80 @@
+; RUN: opt < %s -S -loop-unroll -mcpu=nehalem -x86-use-partial-unrolling=1 | FileCheck %s
+; RUN: opt < %s -S -loop-unroll -mcpu=core -x86-use-partial-unrolling=1 | FileCheck -check-prefix=CHECK-NOUNRL %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %0 = getelementptr inbounds double* %b, i64 %index
+  %1 = bitcast double* %0 to <2 x double>*
+  %wide.load = load <2 x double>* %1, align 8
+  %.sum9 = or i64 %index, 2
+  %2 = getelementptr double* %b, i64 %.sum9
+  %3 = bitcast double* %2 to <2 x double>*
+  %wide.load8 = load <2 x double>* %3, align 8
+  %4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %5 = fadd <2 x double> %wide.load8, <double 1.000000e+00, double 1.000000e+00>
+  %6 = getelementptr inbounds double* %a, i64 %index
+  %7 = bitcast double* %6 to <2 x double>*
+  store <2 x double> %4, <2 x double>* %7, align 8
+  %.sum10 = or i64 %index, 2
+  %8 = getelementptr double* %a, i64 %.sum10
+  %9 = bitcast double* %8 to <2 x double>*
+  store <2 x double> %5, <2 x double>* %9, align 8
+  %index.next = add i64 %index, 4
+  %10 = icmp eq i64 %index.next, 1600
+  br i1 %10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+; CHECK-LABEL: @foo
+; CHECK-NOUNRL-LABEL: @foo
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+define void @bar(i32* noalias nocapture readnone %ip, double %alpha, double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %v0 = getelementptr inbounds double* %b, i64 %index
+  %v1 = bitcast double* %v0 to <2 x double>*
+  %wide.load = load <2 x double>* %v1, align 8
+  %v4 = fadd <2 x double> %wide.load, <double 1.000000e+00, double 1.000000e+00>
+  %v5 = fmul <2 x double> %v4, <double 8.000000e+00, double 8.000000e+00>
+  %v6 = getelementptr inbounds double* %a, i64 %index
+  %v7 = bitcast double* %v6 to <2 x double>*
+  store <2 x double> %v5, <2 x double>* %v7, align 8
+  %index.next = add i64 %index, 2
+  %v10 = icmp eq i64 %index.next, 1600
+  br i1 %v10, label %for.end, label %vector.body
+
+; FIXME: We should probably unroll this loop by a factor of 2, but the cost
+; model needs to be fixed to account for instructions likely to be folded
+; as part of an addressing mode.
+
+; CHECK-LABEL: @bar
+; CHECK: fadd
+; CHECK-NEXT: fmul
+; CHECK: fadd
+; CHECK-NEXT: fmul
+
+; CHECK-NOUNRL-LABEL: @bar
+; CHECK-NOUNRL: fadd
+; CHECK-NOUNRL-NEXT: fmul
+; CHECK-NOUNRL-NOT: fadd
+
+for.end:                                          ; preds = %vector.body
+  ret void
+}
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index e98a4acdde..224823b8ed 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,13 @@
-; RUN: opt < %s -mcpu=corei7 -O1 -S | FileCheck %s --check-prefix=O1
-; RUN: opt < %s -mcpu=corei7 -O2 -S | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S | FileCheck %s --check-prefix=O3
-; RUN: opt < %s -mcpu=corei7 -Os -S | FileCheck %s --check-prefix=Os
-; RUN: opt < %s -mcpu=corei7 -Oz -S | FileCheck %s --check-prefix=Oz
-; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S | FileCheck %s --check-prefix=O1VEC
-; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S | FileCheck %s --check-prefix=OzVEC
-; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S | FileCheck %s --check-prefix=O1VEC2
-; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O1 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1
+; RUN: opt < %s -mcpu=corei7 -O2 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O2
+; RUN: opt < %s -mcpu=corei7 -O3 -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -Os -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Os
+; RUN: opt < %s -mcpu=corei7 -Oz -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=Oz
+; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC
+; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC
+; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O1VEC2
+; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=OzVEC2
+; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -x86-use-partial-unrolling=0 | FileCheck %s --check-prefix=O3DIS
 
 ; This file tests the llvm.vectorizer.pragma forcing vectorization even when
 ; optimization levels are too low, or when vectorization is disabled.
author	Hal Finkel <hfinkel@anl.gov>	2014-04-01 18:50:34 +0000
committer	Hal Finkel <hfinkel@anl.gov>	2014-04-01 18:50:34 +0000
commit	e30aa957e33a8e75a79a5ffb5faa1e1397731273 (patch)
tree	ff0b70af194f1aef77c0f7b259f4f8a6618b5157
parent	0d5c0629bbd49977ed53a093fd96ed3fd2c234f5 (diff)
download	llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.gz llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.bz2 llvm-e30aa957e33a8e75a79a5ffb5faa1e1397731273.tar.xz