summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew Trick <atrick@apple.com>2013-12-05 17:55:58 +0000
committerAndrew Trick <atrick@apple.com>2013-12-05 17:55:58 +0000
commit573931394fc307a4606bd0b1854d4df5bf5638a1 (patch)
tree9f16093011d94726301cec49d8ded650df2cff3a
parentbdbcb4dfbc4e5c0bfeafa8416c9ac1ae39e4b794 (diff)
downloadllvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.gz
llvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.bz2
llvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.xz
MI-Sched: handle latency of in-order operations with the new machine model.
The per-operand machine model allows the target to define "unbuffered" processor resources. This change is a quick, cheap way to model stalls caused by the latency of operations that use such resources. This only applies when the processor's micro-op buffer size is non-zero (Out-of-Order). We can't precisely model in-order stalls during out-of-order execution, but this is an easy and effective heuristic. It benefits cortex-a9 scheduling when using the new machine model, which is not yet on by default. MI-Sched for armv7 was evaluated on Swift (and only not enabled because of a performance bug related to predication). However, we never evaluated Cortex-A9 performance on MI-Sched in its current form. This change adds MI-Sched functionality to reach performance goals on A9. The only remaining change is to allow MI-Sched to run as a PostRA pass. I evaluated performance using a set of options to estimate the performance impact once MI sched is default on armv7: -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results: (min run time over 2 runs, filtering tiny changes) Speedups: | Benchmarks/BenchmarkGame/recursive | 52.39% | | Benchmarks/VersaBench/beamformer | 20.80% | | Benchmarks/Misc/pi | 19.97% | | Benchmarks/Misc/mandel-2 | 19.95% | | SPEC/CFP2000/188.ammp | 18.72% | | Benchmarks/McCat/08-main/main | 18.58% | | Benchmarks/Misc-C++/Large/sphereflake | 18.46% | | Benchmarks/Olden/power | 17.11% | | Benchmarks/Misc-C++/mandel-text | 16.47% | | Benchmarks/Misc/oourafft | 15.94% | | Benchmarks/Misc/flops-7 | 14.99% | | Benchmarks/FreeBench/distray | 14.26% | | SPEC/CFP2006/470.lbm | 14.00% | | mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% | | Benchmarks/SmallPT/smallpt | 10.36% | | Benchmarks/Misc-C++/Large/ray | 8.97% | | Benchmarks/Misc/fp-convert | 8.75% | | Benchmarks/Olden/perimeter | 7.10% | | Benchmarks/Bullet/bullet | 7.03% | | Benchmarks/Misc/mandel | 6.75% | | Benchmarks/Olden/voronoi | 6.26% | | Benchmarks/Misc/flops-8 | 5.77% | | Benchmarks/Misc/matmul_f64_4x4 | 5.19% | | Benchmarks/MiBench/security-rijndael | 5.15% | | Benchmarks/Misc/flops-6 | 5.10% | | Benchmarks/Olden/tsp | 4.46% | | Benchmarks/MiBench/consumer-lame | 4.28% | | Benchmarks/Misc/flops-5 | 4.27% | | Benchmarks/mafft/pairlocalalign | 4.19% | | Benchmarks/Misc/himenobmtxpa | 4.07% | | Benchmarks/Misc/lowercase | 4.06% | | SPEC/CFP2006/433.milc | 3.99% | | Benchmarks/tramp3d-v4 | 3.79% | | Benchmarks/FreeBench/pifft | 3.66% | | Benchmarks/Ptrdist/ks | 3.21% | | Benchmarks/Adobe-C++/loop_unroll | 3.12% | | SPEC/CINT2000/175.vpr | 3.12% | | Benchmarks/nbench | 2.98% | | SPEC/CFP2000/183.equake | 2.91% | | Benchmarks/Misc/perlin | 2.85% | | Benchmarks/Misc/flops-1 | 2.82% | | Benchmarks/Misc-C++-EH/spirit | 2.80% | | Benchmarks/Misc/flops-2 | 2.77% | | Benchmarks/NPB-serial/is | 2.42% | | Benchmarks/ASC_Sequoia/CrystalMk | 2.33% | | Benchmarks/BenchmarkGame/n-body | 2.28% | | Benchmarks/SciMark2-C/scimark2 | 2.27% | | Benchmarks/Olden/bh | 2.03% | | skidmarks10/skidmarks | 1.81% | | Benchmarks/Misc/flops | 1.72% | Slowdowns: | Benchmarks/llubenchmark/llu | -14.14% | | Benchmarks/Polybench/stencils/seidel-2d | -5.67% | | Benchmarks/Adobe-C++/functionobjects | -5.25% | | Benchmarks/Misc-C++/oopack_v1p8 | -5.00% | | Benchmarks/Shootout/hash | -2.35% | | Benchmarks/Prolangs-C++/ocean | -2.01% | | Benchmarks/Polybench/medley/floyd-warshall | -1.98% | | Polybench/linear-algebra/kernels/3mm | -1.95% | | Benchmarks/McCat/09-vor/vor | -1.68% | git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196516 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/CodeGen/ScheduleDAG.h22
-rw-r--r--lib/CodeGen/MachineScheduler.cpp41
-rw-r--r--lib/CodeGen/ScheduleDAGInstrs.cpp16
-rw-r--r--lib/Target/ARM/ARMScheduleA9.td2
-rw-r--r--test/CodeGen/ARM/saxpy10-a9.ll135
5 files changed, 201 insertions, 15 deletions
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index ccba1b0364..48ed232a15 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -292,6 +292,7 @@ namespace llvm {
bool isScheduleHigh : 1; // True if preferable to schedule high.
bool isScheduleLow : 1; // True if preferable to schedule low.
bool isCloned : 1; // True if this node has been cloned.
+ bool isUnbuffered : 1; // Reads an unbuffered resource.
Sched::Preference SchedulingPref; // Scheduling preference.
private:
@@ -316,9 +317,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
- isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
- isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+ isScheduleLow(false), isCloned(false), isUnbuffered(false),
+ SchedulingPref(Sched::None), isDepthCurrent(false),
+ isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+ BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct an SUnit for post-regalloc scheduling to represent
/// a MachineInstr.
@@ -330,9 +332,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
- isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
- isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+ isScheduleLow(false), isCloned(false), isUnbuffered(false),
+ SchedulingPref(Sched::None), isDepthCurrent(false),
+ isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+ BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// SUnit - Construct a placeholder SUnit.
SUnit()
@@ -343,9 +346,10 @@ namespace llvm {
isTwoAddress(false), isCommutable(false), hasPhysRegUses(false),
hasPhysRegDefs(false), hasPhysRegClobbers(false), isPending(false),
isAvailable(false), isScheduled(false), isScheduleHigh(false),
- isScheduleLow(false), isCloned(false), SchedulingPref(Sched::None),
- isDepthCurrent(false), isHeightCurrent(false), Depth(0), Height(0),
- TopReadyCycle(0), BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
+ isScheduleLow(false), isCloned(false), isUnbuffered(false),
+ SchedulingPref(Sched::None), isDepthCurrent(false),
+ isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
+ BotReadyCycle(0), CopyDstRC(NULL), CopySrcRC(NULL) {}
/// \brief Boundary nodes are placeholders for the boundary of the
/// scheduling region.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index f8c4a893c1..3296149afa 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1330,7 +1330,7 @@ public:
/// Represent the type of SchedCandidate found within a single queue.
/// pickNodeBidirectional depends on these listed by decreasing priority.
enum CandReason {
- NoCand, PhysRegCopy, RegExcess, RegCritical, Cluster, Weak, RegMax,
+ NoCand, PhysRegCopy, RegExcess, RegCritical, Stall, Cluster, Weak, RegMax,
ResourceReduce, ResourceDemand, BotHeightReduce, BotPathReduce,
TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
@@ -1583,6 +1583,10 @@ public:
MaxExecutedResCount);
}
+ /// Get the difference between the given SUnit's ready time and the current
+ /// cycle.
+ unsigned getLatencyStallCycles(SUnit *SU);
+
bool checkHazard(SUnit *SU);
unsigned findMaxLatency(ArrayRef<SUnit*> ReadySUs);
@@ -1869,6 +1873,23 @@ void GenericScheduler::registerRoots() {
}
}
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
+unsigned GenericScheduler::SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+ if (!SU->isUnbuffered)
+ return 0;
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ if (ReadyCycle > CurrCycle)
+ return ReadyCycle - CurrCycle;
+ return 0;
+}
+
/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
@@ -1948,9 +1969,9 @@ getOtherResourceCount(unsigned &OtherCritIdx) {
/// inside and outside the zone.
void GenericScheduler::SchedBoundary::setPolicy(CandPolicy &Policy,
SchedBoundary &OtherZone) {
- // Now that potential stalls have been considered, apply preemptive heuristics
- // based on the the total latency and resources inside and outside this
- // zone.
+ // Apply preemptive heuristics based on the the total latency and resources
+ // inside and outside this zone. Potential stalls should be considered before
+ // following this policy.
// Compute remaining latency. We need this both to determine whether the
// overall schedule has become latency-limited and whether the instructions
@@ -2141,7 +2162,11 @@ void GenericScheduler::SchedBoundary::bumpNode(SUnit *SU) {
break;
default:
// We don't currently model the OOO reorder buffer, so consider all
- // scheduled MOps to be "retired".
+ // scheduled MOps to be "retired". We do loosely model in-order resource
+ // latency. If this instruction uses an in-order resource, account for any
+ // likely stall cycles.
+ if (SU->isUnbuffered && ReadyCycle > NextCycle)
+ NextCycle = ReadyCycle;
break;
}
RetiredMOps += IncMOps;
@@ -2514,6 +2539,11 @@ void GenericScheduler::tryCandidate(SchedCandidate &Cand,
&& tryLatency(TryCand, Cand, Zone))
return;
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
+ Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+
// Keep clustered nodes together to encourage downstream peephole
// optimizations which may reduce resource requirements.
//
@@ -2577,6 +2607,7 @@ const char *GenericScheduler::getReasonStr(
case PhysRegCopy: return "PREG-COPY";
case RegExcess: return "REG-EXCESS";
case RegCritical: return "REG-CRIT ";
+ case Stall: return "STALL ";
case Cluster: return "CLUSTER ";
case Weak: return "WEAK ";
case RegMax: return "REG-MAX ";
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 7d9160ab1f..eeae6ec03d 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -687,6 +687,22 @@ void ScheduleDAGInstrs::initSUnits() {
// Assign the Latency field of SU using target-provided information.
SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+
+ // If this SUnit uses an unbuffered resource, mark it as such.
+ // These resources are used for in-order execution pipelines within an
+ // out-of-order core and are identified by BufferSize=1. BufferSize=0 is
+ // used for dispatch/issue groups and is not considered here.
+ if (SchedModel.hasInstrSchedModel()) {
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize == 1) {
+ SU->isUnbuffered = true;
+ break;
+ }
+ }
+ }
}
}
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 6276cfc200..7d114a36b9 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1905,7 +1905,7 @@ def A9UnitALU : ProcResource<2>;
def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
def A9UnitAGU : ProcResource<1>;
def A9UnitLS : ProcResource<1>;
-def A9UnitFP : ProcResource<1> { let BufferSize = 0; }
+def A9UnitFP : ProcResource<1> { let BufferSize = 1; }
def A9UnitB : ProcResource<1>;
//===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/ARM/saxpy10-a9.ll b/test/CodeGen/ARM/saxpy10-a9.ll
new file mode 100644
index 0000000000..1102800dce
--- /dev/null
+++ b/test/CodeGen/ARM/saxpy10-a9.ll
@@ -0,0 +1,135 @@
+; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false | FileCheck %s
+;
+; Test MI-Sched suppory latency based stalls on in in-order pipeline
+; using the new machine model.
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+
+; Don't be too strict with the top of the schedule, but most of it
+; should be nicely pipelined.
+;
+; CHECK: saxpy10:
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vmul
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vldr
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vadd
+; CHECK-NEXT: vmov
+; CHECK-NEXT: bx
+;
+; This accumulates a sum rather than storing each result.
+define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) {
+entry:
+ %0 = load float* %data1, align 4
+ %mul = fmul float %0, %a
+ %1 = load float* %data2, align 4
+ %add = fadd float %mul, %1
+ %add2 = fadd float %add, 0.000000e+00
+ %arrayidx.1 = getelementptr inbounds float* %data1, i32 1
+ %2 = load float* %arrayidx.1, align 4
+ %mul.1 = fmul float %2, %a
+ %arrayidx1.1 = getelementptr inbounds float* %data2, i32 1
+ %3 = load float* %arrayidx1.1, align 4
+ %add.1 = fadd float %mul.1, %3
+ %add2.1 = fadd float %add2, %add.1
+ %arrayidx.2 = getelementptr inbounds float* %data1, i32 2
+ %4 = load float* %arrayidx.2, align 4
+ %mul.2 = fmul float %4, %a
+ %arrayidx1.2 = getelementptr inbounds float* %data2, i32 2
+ %5 = load float* %arrayidx1.2, align 4
+ %add.2 = fadd float %mul.2, %5
+ %add2.2 = fadd float %add2.1, %add.2
+ %arrayidx.3 = getelementptr inbounds float* %data1, i32 3
+ %6 = load float* %arrayidx.3, align 4
+ %mul.3 = fmul float %6, %a
+ %arrayidx1.3 = getelementptr inbounds float* %data2, i32 3
+ %7 = load float* %arrayidx1.3, align 4
+ %add.3 = fadd float %mul.3, %7
+ %add2.3 = fadd float %add2.2, %add.3
+ %arrayidx.4 = getelementptr inbounds float* %data1, i32 4
+ %8 = load float* %arrayidx.4, align 4
+ %mul.4 = fmul float %8, %a
+ %arrayidx1.4 = getelementptr inbounds float* %data2, i32 4
+ %9 = load float* %arrayidx1.4, align 4
+ %add.4 = fadd float %mul.4, %9
+ %add2.4 = fadd float %add2.3, %add.4
+ %arrayidx.5 = getelementptr inbounds float* %data1, i32 5
+ %10 = load float* %arrayidx.5, align 4
+ %mul.5 = fmul float %10, %a
+ %arrayidx1.5 = getelementptr inbounds float* %data2, i32 5
+ %11 = load float* %arrayidx1.5, align 4
+ %add.5 = fadd float %mul.5, %11
+ %add2.5 = fadd float %add2.4, %add.5
+ %arrayidx.6 = getelementptr inbounds float* %data1, i32 6
+ %12 = load float* %arrayidx.6, align 4
+ %mul.6 = fmul float %12, %a
+ %arrayidx1.6 = getelementptr inbounds float* %data2, i32 6
+ %13 = load float* %arrayidx1.6, align 4
+ %add.6 = fadd float %mul.6, %13
+ %add2.6 = fadd float %add2.5, %add.6
+ %arrayidx.7 = getelementptr inbounds float* %data1, i32 7
+ %14 = load float* %arrayidx.7, align 4
+ %mul.7 = fmul float %14, %a
+ %arrayidx1.7 = getelementptr inbounds float* %data2, i32 7
+ %15 = load float* %arrayidx1.7, align 4
+ %add.7 = fadd float %mul.7, %15
+ %add2.7 = fadd float %add2.6, %add.7
+ %arrayidx.8 = getelementptr inbounds float* %data1, i32 8
+ %16 = load float* %arrayidx.8, align 4
+ %mul.8 = fmul float %16, %a
+ %arrayidx1.8 = getelementptr inbounds float* %data2, i32 8
+ %17 = load float* %arrayidx1.8, align 4
+ %add.8 = fadd float %mul.8, %17
+ %add2.8 = fadd float %add2.7, %add.8
+ %arrayidx.9 = getelementptr inbounds float* %data1, i32 9
+ %18 = load float* %arrayidx.9, align 4
+ %mul.9 = fmul float %18, %a
+ %arrayidx1.9 = getelementptr inbounds float* %data2, i32 9
+ %19 = load float* %arrayidx1.9, align 4
+ %add.9 = fadd float %mul.9, %19
+ %add2.9 = fadd float %add2.8, %add.9
+ ret float %add2.9
+}