diff options
author | Andrew Trick <atrick@apple.com> | 2013-12-05 17:55:58 +0000 |
---|---|---|
committer | Andrew Trick <atrick@apple.com> | 2013-12-05 17:55:58 +0000 |
commit | 573931394fc307a4606bd0b1854d4df5bf5638a1 (patch) | |
tree | 9f16093011d94726301cec49d8ded650df2cff3a /test/CodeGen | |
parent | bdbcb4dfbc4e5c0bfeafa8416c9ac1ae39e4b794 (diff) | |
download | llvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.gz llvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.bz2 llvm-573931394fc307a4606bd0b1854d4df5bf5638a1.tar.xz |
MI-Sched: handle latency of in-order operations with the new machine model.
The per-operand machine model allows the target to define "unbuffered"
processor resources. This change is a quick, cheap way to model stalls
caused by the latency of operations that use such resources. This only
applies when the processor's micro-op buffer size is non-zero
(Out-of-Order). We can't precisely model in-order stalls during
out-of-order execution, but this is an easy and effective
heuristic. It benefits cortex-a9 scheduling when using the new
machine model, which is not yet on by default.
MI-Sched for armv7 was evaluated on Swift (and was left disabled only because
of a performance bug related to predication). However, we never
evaluated Cortex-A9 performance on MI-Sched in its current form. This
change adds MI-Sched functionality to reach performance goals on
A9. The only remaining change is to allow MI-Sched to run as a PostRA
pass.
I evaluated performance using the following set of options, chosen to estimate the impact once MI-Sched is the default on armv7:
-mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false
For a simple saxpy loop I see a 1.7x speedup. Here are the llvm-testsuite results:
(min run time over 2 runs, filtering tiny changes)
Speedups:
| Benchmarks/BenchmarkGame/recursive | 52.39% |
| Benchmarks/VersaBench/beamformer | 20.80% |
| Benchmarks/Misc/pi | 19.97% |
| Benchmarks/Misc/mandel-2 | 19.95% |
| SPEC/CFP2000/188.ammp | 18.72% |
| Benchmarks/McCat/08-main/main | 18.58% |
| Benchmarks/Misc-C++/Large/sphereflake | 18.46% |
| Benchmarks/Olden/power | 17.11% |
| Benchmarks/Misc-C++/mandel-text | 16.47% |
| Benchmarks/Misc/oourafft | 15.94% |
| Benchmarks/Misc/flops-7 | 14.99% |
| Benchmarks/FreeBench/distray | 14.26% |
| SPEC/CFP2006/470.lbm | 14.00% |
| mediabench/mpeg2/mpeg2dec/mpeg2decode | 12.28% |
| Benchmarks/SmallPT/smallpt | 10.36% |
| Benchmarks/Misc-C++/Large/ray | 8.97% |
| Benchmarks/Misc/fp-convert | 8.75% |
| Benchmarks/Olden/perimeter | 7.10% |
| Benchmarks/Bullet/bullet | 7.03% |
| Benchmarks/Misc/mandel | 6.75% |
| Benchmarks/Olden/voronoi | 6.26% |
| Benchmarks/Misc/flops-8 | 5.77% |
| Benchmarks/Misc/matmul_f64_4x4 | 5.19% |
| Benchmarks/MiBench/security-rijndael | 5.15% |
| Benchmarks/Misc/flops-6 | 5.10% |
| Benchmarks/Olden/tsp | 4.46% |
| Benchmarks/MiBench/consumer-lame | 4.28% |
| Benchmarks/Misc/flops-5 | 4.27% |
| Benchmarks/mafft/pairlocalalign | 4.19% |
| Benchmarks/Misc/himenobmtxpa | 4.07% |
| Benchmarks/Misc/lowercase | 4.06% |
| SPEC/CFP2006/433.milc | 3.99% |
| Benchmarks/tramp3d-v4 | 3.79% |
| Benchmarks/FreeBench/pifft | 3.66% |
| Benchmarks/Ptrdist/ks | 3.21% |
| Benchmarks/Adobe-C++/loop_unroll | 3.12% |
| SPEC/CINT2000/175.vpr | 3.12% |
| Benchmarks/nbench | 2.98% |
| SPEC/CFP2000/183.equake | 2.91% |
| Benchmarks/Misc/perlin | 2.85% |
| Benchmarks/Misc/flops-1 | 2.82% |
| Benchmarks/Misc-C++-EH/spirit | 2.80% |
| Benchmarks/Misc/flops-2 | 2.77% |
| Benchmarks/NPB-serial/is | 2.42% |
| Benchmarks/ASC_Sequoia/CrystalMk | 2.33% |
| Benchmarks/BenchmarkGame/n-body | 2.28% |
| Benchmarks/SciMark2-C/scimark2 | 2.27% |
| Benchmarks/Olden/bh | 2.03% |
| skidmarks10/skidmarks | 1.81% |
| Benchmarks/Misc/flops | 1.72% |
Slowdowns:
| Benchmarks/llubenchmark/llu | -14.14% |
| Benchmarks/Polybench/stencils/seidel-2d | -5.67% |
| Benchmarks/Adobe-C++/functionobjects | -5.25% |
| Benchmarks/Misc-C++/oopack_v1p8 | -5.00% |
| Benchmarks/Shootout/hash | -2.35% |
| Benchmarks/Prolangs-C++/ocean | -2.01% |
| Benchmarks/Polybench/medley/floyd-warshall | -1.98% |
| Polybench/linear-algebra/kernels/3mm | -1.95% |
| Benchmarks/McCat/09-vor/vor | -1.68% |
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196516 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/ARM/saxpy10-a9.ll | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/test/CodeGen/ARM/saxpy10-a9.ll b/test/CodeGen/ARM/saxpy10-a9.ll new file mode 100644 index 0000000000..1102800dce --- /dev/null +++ b/test/CodeGen/ARM/saxpy10-a9.ll @@ -0,0 +1,135 @@ +; RUN: llc < %s -march=arm -mtriple=thumbv7-apple-ios7.0.0 -float-abi=hard -mcpu=cortex-a9 -disable-post-ra -misched-bench -scheditins=false | FileCheck %s +; +; Test MI-Sched support for latency-based stalls on an in-order pipeline +; using the new machine model. + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; Don't be too strict with the top of the schedule, but most of it +; should be nicely pipelined. +; +; CHECK: saxpy10: +; CHECK: vldr +; CHECK: vldr +; CHECK: vldr +; CHECK: vldr +; CHECK: vldr +; CHECK: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vldr +; CHECK-NEXT: vmul +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vldr +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vadd +; CHECK-NEXT: vmov +; CHECK-NEXT: bx +; +; This accumulates a sum rather than storing each result. 
+define float @saxpy10(float* nocapture readonly %data1, float* nocapture readonly %data2, float %a) { +entry: + %0 = load float* %data1, align 4 + %mul = fmul float %0, %a + %1 = load float* %data2, align 4 + %add = fadd float %mul, %1 + %add2 = fadd float %add, 0.000000e+00 + %arrayidx.1 = getelementptr inbounds float* %data1, i32 1 + %2 = load float* %arrayidx.1, align 4 + %mul.1 = fmul float %2, %a + %arrayidx1.1 = getelementptr inbounds float* %data2, i32 1 + %3 = load float* %arrayidx1.1, align 4 + %add.1 = fadd float %mul.1, %3 + %add2.1 = fadd float %add2, %add.1 + %arrayidx.2 = getelementptr inbounds float* %data1, i32 2 + %4 = load float* %arrayidx.2, align 4 + %mul.2 = fmul float %4, %a + %arrayidx1.2 = getelementptr inbounds float* %data2, i32 2 + %5 = load float* %arrayidx1.2, align 4 + %add.2 = fadd float %mul.2, %5 + %add2.2 = fadd float %add2.1, %add.2 + %arrayidx.3 = getelementptr inbounds float* %data1, i32 3 + %6 = load float* %arrayidx.3, align 4 + %mul.3 = fmul float %6, %a + %arrayidx1.3 = getelementptr inbounds float* %data2, i32 3 + %7 = load float* %arrayidx1.3, align 4 + %add.3 = fadd float %mul.3, %7 + %add2.3 = fadd float %add2.2, %add.3 + %arrayidx.4 = getelementptr inbounds float* %data1, i32 4 + %8 = load float* %arrayidx.4, align 4 + %mul.4 = fmul float %8, %a + %arrayidx1.4 = getelementptr inbounds float* %data2, i32 4 + %9 = load float* %arrayidx1.4, align 4 + %add.4 = fadd float %mul.4, %9 + %add2.4 = fadd float %add2.3, %add.4 + %arrayidx.5 = getelementptr inbounds float* %data1, i32 5 + %10 = load float* %arrayidx.5, align 4 + %mul.5 = fmul float %10, %a + %arrayidx1.5 = getelementptr inbounds float* %data2, i32 5 + %11 = load float* %arrayidx1.5, align 4 + %add.5 = fadd float %mul.5, %11 + %add2.5 = fadd float %add2.4, %add.5 + %arrayidx.6 = getelementptr inbounds float* %data1, i32 6 + %12 = load float* %arrayidx.6, align 4 + %mul.6 = fmul float %12, %a + %arrayidx1.6 = getelementptr inbounds float* %data2, i32 6 + %13 = load 
float* %arrayidx1.6, align 4 + %add.6 = fadd float %mul.6, %13 + %add2.6 = fadd float %add2.5, %add.6 + %arrayidx.7 = getelementptr inbounds float* %data1, i32 7 + %14 = load float* %arrayidx.7, align 4 + %mul.7 = fmul float %14, %a + %arrayidx1.7 = getelementptr inbounds float* %data2, i32 7 + %15 = load float* %arrayidx1.7, align 4 + %add.7 = fadd float %mul.7, %15 + %add2.7 = fadd float %add2.6, %add.7 + %arrayidx.8 = getelementptr inbounds float* %data1, i32 8 + %16 = load float* %arrayidx.8, align 4 + %mul.8 = fmul float %16, %a + %arrayidx1.8 = getelementptr inbounds float* %data2, i32 8 + %17 = load float* %arrayidx1.8, align 4 + %add.8 = fadd float %mul.8, %17 + %add2.8 = fadd float %add2.7, %add.8 + %arrayidx.9 = getelementptr inbounds float* %data1, i32 9 + %18 = load float* %arrayidx.9, align 4 + %mul.9 = fmul float %18, %a + %arrayidx1.9 = getelementptr inbounds float* %data2, i32 9 + %19 = load float* %arrayidx1.9, align 4 + %add.9 = fadd float %mul.9, %19 + %add2.9 = fadd float %add2.8, %add.9 + ret float %add2.9 +} |