From 572645cf84060c0fc25cb91d38cb9079918b3a88 Mon Sep 17 00:00:00 2001
From: Dan Gohman
Date: Fri, 12 Feb 2010 10:34:29 +0000
Subject: Reapply the new LoopStrengthReduction code, with compile time and
 bug fixes, and with improved heuristics for analyzing foreign-loop addrecs.

This change also flattens IVUsers, eliminating the stride-oriented
groupings, which makes it easier to work with.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@95975 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/2006-05-11-InstrSched.ll      |   4 +-
 test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll    |   2 +-
 test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll |   2 +-
 test/CodeGen/X86/full-lsr.ll                   |   9 +-
 test/CodeGen/X86/iv-users-in-other-loops.ll    |   8 +-
 test/CodeGen/X86/loop-strength-reduce-2.ll     |  19 +-
 test/CodeGen/X86/loop-strength-reduce-3.ll     |  13 +-
 test/CodeGen/X86/loop-strength-reduce.ll       |  13 +-
 test/CodeGen/X86/loop-strength-reduce4.ll      |  18 +-
 test/CodeGen/X86/loop-strength-reduce8.ll      |   8 +-
 test/CodeGen/X86/lsr-reuse.ll                  | 386 +++++++++++++++++++++++++
 test/CodeGen/X86/masked-iv-safe.ll             |   6 +-
 test/CodeGen/X86/pr3495.ll                     |   3 +-
 13 files changed, 455 insertions(+), 36 deletions(-)
 create mode 100644 test/CodeGen/X86/lsr-reuse.ll

diff --git a/test/CodeGen/X86/2006-05-11-InstrSched.ll b/test/CodeGen/X86/2006-05-11-InstrSched.ll
index bdbe713a29..56d6aa960e 100644
--- a/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ b/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse2 -stats -realign-stack=0 |&\
-; RUN: grep {asm-printer} | grep 31
+; RUN: grep {asm-printer} | grep 34
 target datalayout = "e-p:32:32"
 
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
@@ -40,7 +40,7 @@ cond_true:		; preds = %cond_true, %entry
 	%tmp137.upgrd.7 = bitcast i32* %tmp137 to <2 x i64>*		; <<2 x i64>*> [#uses=1]
 	store <2 x i64> %tmp131, <2 x i64>* %tmp137.upgrd.7
 	%tmp147 = add nsw i32 %tmp.10, 8		; <i32> [#uses=1]
-	%tmp.upgrd.8 = icmp slt i32 %tmp147, %M		; <i1> [#uses=1]
+	%tmp.upgrd.8 = icmp ne i32 %tmp147, %M		; <i1> [#uses=1]
 	%indvar.next = add i32 %indvar, 1		; <i32> [#uses=1]
 	br i1 %tmp.upgrd.8, label %cond_true, label %return
diff --git a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index 4cac9b4c4a..e1f890192d 100644
--- a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -march=x86 -mtriple=i686-darwin | \
 ; RUN:   grep push | count 3
 
-define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) {
+define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
 entry:
 	icmp sgt i32 %size, 0		; <i1>:0 [#uses=1]
 	br i1 %0, label %bb.preheader, label %return
diff --git a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
index 721d4c945b..8e315f4d80 100644
--- a/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
+++ b/test/CodeGen/X86/2007-11-30-LoadFolding-Bug.ll
@@ -35,7 +35,7 @@ cond_next36.i:		; preds = %cond_next.i
 bb.i28.i:		; preds = %bb.i28.i, %cond_next36.i
 ; CHECK: %bb.i28.i
 ; CHECK: addl $2
-; CHECK: addl $2
+; CHECK: addl $-2
 	%j.0.reg2mem.0.i16.i = phi i32 [ 0, %cond_next36.i ], [ %indvar.next39.i, %bb.i28.i ]		; <i32> [#uses=2]
 	%din_addr.1.reg2mem.0.i17.i = phi double [ 0.000000e+00, %cond_next36.i ], [ %tmp16.i25.i, %bb.i28.i ]		; <double> [#uses=1]
 	%tmp1.i18.i = fptosi double %din_addr.1.reg2mem.0.i17.i to i32		; <i32> [#uses=2]
diff --git a/test/CodeGen/X86/full-lsr.ll b/test/CodeGen/X86/full-lsr.ll
index 3bd58b65be..ff9b1b0b6a 100644
--- a/test/CodeGen/X86/full-lsr.ll
+++ b/test/CodeGen/X86/full-lsr.ll
@@ -1,12 +1,7 @@
 ; RUN: llc < %s -march=x86 >%t
 
-; TODO: Enhance full lsr mode to get this:
-; RUNX: grep {addl \\\$4,} %t | count 3
-; RUNX: not grep {,%} %t
-
-; For now, it should find this, which is still pretty good:
-; RUN: not grep {addl \\\$4,} %t
-; RUN: grep {,%} %t | count 6
+; RUN: grep {addl \\\$4,} %t | count 3
+; RUN: not grep {,%} %t
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
 entry:
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index c695c29e06..408fb20b8d 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,11 +1,11 @@
 ; RUN: llc < %s -march=x86-64 -o %t
-; RUN: grep inc %t | count 1
+; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
 ; RUN: grep addq %t | count 13
 ; RUN: not grep addb %t
-; RUN: grep leaq %t | count 9
-; RUN: grep leal %t | count 3
-; RUN: grep movq %t | count 5
+; RUN: not grep leaq %t
+; RUN: not grep leal %t
+; RUN: not grep movq %t
 
 ; IV users in each of the loops from other loops shouldn't cause LSR
 ; to insert new induction variables. Previously it would create a
diff --git a/test/CodeGen/X86/loop-strength-reduce-2.ll b/test/CodeGen/X86/loop-strength-reduce-2.ll
index 30b5114349..b546462b68 100644
--- a/test/CodeGen/X86/loop-strength-reduce-2.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-2.ll
@@ -1,11 +1,24 @@
-; RUN: llc < %s -march=x86 -relocation-model=pic | \
-; RUN:   grep {, 4} | count 1
-; RUN: llc < %s -march=x86 | not grep lea
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s -check-prefix=STATIC
 ;
 ; Make sure the common loop invariant A is hoisted up to preheader,
 ; since too many registers are needed to subsume it into the addressing modes.
 ; It's safe to sink A in when it's not pic.
 
+; PIC: align
+; PIC: movl $4, -4([[REG:%e[a-z]+]])
+; PIC: movl $5, ([[REG]])
+; PIC: addl $4, [[REG]]
+; PIC: decl {{%e[[a-z]+}}
+; PIC: jne
+
+; STATIC: align
+; STATIC: movl $4, -4(%ecx)
+; STATIC: movl $5, (%ecx)
+; STATIC: addl $4, %ecx
+; STATIC: decl %eax
+; STATIC: jne
+
 @A = global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]
 
 define void @test(i32 %row, i32 %N.in) nounwind {
diff --git a/test/CodeGen/X86/loop-strength-reduce-3.ll b/test/CodeGen/X86/loop-strength-reduce-3.ll
index 70c91340c9..b1c9fb9c07 100644
--- a/test/CodeGen/X86/loop-strength-reduce-3.ll
+++ b/test/CodeGen/X86/loop-strength-reduce-3.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | \
-; RUN:   grep {A+} | count 2
-;
-; Make sure the common loop invariant A is not hoisted up to preheader,
-; since it can be subsumed it into the addressing modes.
+; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=dynamic-no-pic | FileCheck %s
+
+; CHECK: align
+; CHECK: movl $4, -4(%ecx)
+; CHECK: movl $5, (%ecx)
+; CHECK: addl $4, %ecx
+; CHECK: decl %eax
+; CHECK: jne
 
 @A = global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]
 
diff --git a/test/CodeGen/X86/loop-strength-reduce.ll b/test/CodeGen/X86/loop-strength-reduce.ll
index 4cb56ca9ed..42c6ac4983 100644
--- a/test/CodeGen/X86/loop-strength-reduce.ll
+++ b/test/CodeGen/X86/loop-strength-reduce.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | \
-; RUN:   grep {A+} | count 2
-;
-; Make sure the common loop invariant A is not hoisted up to preheader,
-; since it can be subsumed into the addressing mode in all uses.
+; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+
+; CHECK: align
+; CHECK: movl $4, -4(%ecx)
+; CHECK: movl $5, (%ecx)
+; CHECK: addl $4, %ecx
+; CHECK: decl %eax
+; CHECK: jne
 
 @A = internal global [16 x [16 x i32]] zeroinitializer, align 32		; <[16 x [16 x i32]]*> [#uses=2]
 
diff --git a/test/CodeGen/X86/loop-strength-reduce4.ll b/test/CodeGen/X86/loop-strength-reduce4.ll
index 07e46eca75..6c0eb8c0df 100644
--- a/test/CodeGen/X86/loop-strength-reduce4.ll
+++ b/test/CodeGen/X86/loop-strength-reduce4.ll
@@ -1,5 +1,19 @@
-; RUN: llc < %s -march=x86 | grep cmp | grep 64
-; RUN: llc < %s -march=x86 | not grep inc
+; RUN: llc < %s -march=x86 -relocation-model=static -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=STATIC
+; RUN: llc < %s -march=x86 -relocation-model=pic | FileCheck %s -check-prefix=PIC
+
+; By starting the IV at -64 instead of 0, a cmp is eliminated,
+; as the flags from the add can be used directly.
+
+; STATIC: movl $-64, %ecx
+
+; STATIC: movl %eax, _state+76(%ecx)
+; STATIC: addl $16, %ecx
+; STATIC: jne
+
+; In PIC mode the symbol can't be folded, so the change-compare-stride
+; trick applies.
+
+; PIC: cmpl $64
 
 @state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
diff --git a/test/CodeGen/X86/loop-strength-reduce8.ll b/test/CodeGen/X86/loop-strength-reduce8.ll
index e14cd8a99e..6b2247d1d6 100644
--- a/test/CodeGen/X86/loop-strength-reduce8.ll
+++ b/test/CodeGen/X86/loop-strength-reduce8.ll
@@ -1,4 +1,10 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | grep leal | not grep 16
+; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+
+; CHECK: leal 16(%eax), %edx
+; CHECK: align
+; CHECK: addl $4, %edx
+; CHECK: decl %ecx
+; CHECK: jne LBB1_2
 
 	%struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32 }
 	%struct.bitmap_element = type { %struct.bitmap_element*, %struct.bitmap_element*, i32, [2 x i64] }
diff --git a/test/CodeGen/X86/lsr-reuse.ll b/test/CodeGen/X86/lsr-reuse.ll
new file mode 100644
index 0000000000..7f2b8cc8f8
--- /dev/null
+++ b/test/CodeGen/X86/lsr-reuse.ll
@@ -0,0 +1,386 @@
+; RUN: llc < %s -march=x86-64 -O3 | FileCheck %s
+target datalayout = "e-p:64:64:64"
+target triple = "x86_64-unknown-unknown"
+
+; Full strength reduction reduces register pressure from 5 to 4 here.
+; Instruction selection should use the FLAGS value from the dec for
+; the branch. Scheduling should push the adds upwards.
+
+; CHECK: full_me_0:
+; CHECK: movsd (%rsi), %xmm0
+; CHECK: addq $8, %rsi
+; CHECK: mulsd (%rdx), %xmm0
+; CHECK: addq $8, %rdx
+; CHECK: movsd %xmm0, (%rdi)
+; CHECK: addq $8, %rdi
+; CHECK: decq %rcx
+; CHECK: jne
+
+define void @full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; Mostly-full strength reduction means we do full strength reduction on all
+; except for the offsets.
+;
+; Given a choice between constant offsets -2048 and 2048, choose the negative
+; value, because at boundary conditions it has a smaller encoding.
+; TODO: That's an over-general heuristic. It would be better for the target
+; to indicate what the encoding cost would be. Then using a 2048 offset
+; would be better on x86-64, since the start value would be 0 instead of
+; 2048.
+
+; CHECK: mostly_full_me_0:
+; CHECK: movsd -2048(%rsi), %xmm0
+; CHECK: mulsd -2048(%rdx), %xmm0
+; CHECK: movsd %xmm0, -2048(%rdi)
+; CHECK: movsd (%rsi), %xmm0
+; CHECK: addq $8, %rsi
+; CHECK: divsd (%rdx), %xmm0
+; CHECK: addq $8, %rdx
+; CHECK: movsd %xmm0, (%rdi)
+; CHECK: addq $8, %rdi
+; CHECK: decq %rcx
+; CHECK: jne
+
+define void @mostly_full_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %j = add i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; A minor variation on mostly_full_me_0.
+; Prefer to start the indvar at 0.
+
+; CHECK: mostly_full_me_1:
+; CHECK: movsd (%rsi), %xmm0
+; CHECK: mulsd (%rdx), %xmm0
+; CHECK: movsd %xmm0, (%rdi)
+; CHECK: movsd -2048(%rsi), %xmm0
+; CHECK: addq $8, %rsi
+; CHECK: divsd -2048(%rdx), %xmm0
+; CHECK: addq $8, %rdx
+; CHECK: movsd %xmm0, -2048(%rdi)
+; CHECK: addq $8, %rdi
+; CHECK: decq %rcx
+; CHECK: jne
+
+define void @mostly_full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %j = sub i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; A slightly less minor variation on mostly_full_me_0.
+
+; CHECK: mostly_full_me_2:
+; CHECK: movsd (%rsi), %xmm0
+; CHECK: mulsd (%rdx), %xmm0
+; CHECK: movsd %xmm0, (%rdi)
+; CHECK: movsd -4096(%rsi), %xmm0
+; CHECK: addq $8, %rsi
+; CHECK: divsd -4096(%rdx), %xmm0
+; CHECK: addq $8, %rdx
+; CHECK: movsd %xmm0, -4096(%rdi)
+; CHECK: addq $8, %rdi
+; CHECK: decq %rcx
+; CHECK: jne
+
+define void @mostly_full_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %k = add i64 %i, 256
+  %Ak = getelementptr inbounds double* %A, i64 %k
+  %Bk = getelementptr inbounds double* %B, i64 %k
+  %Ck = getelementptr inbounds double* %C, i64 %k
+  %t1 = load double* %Bk
+  %t2 = load double* %Ck
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ak
+  %j = sub i64 %i, 256
+  %Aj = getelementptr inbounds double* %A, i64 %j
+  %Bj = getelementptr inbounds double* %B, i64 %j
+  %Cj = getelementptr inbounds double* %C, i64 %j
+  %t3 = load double* %Bj
+  %t4 = load double* %Cj
+  %o = fdiv double %t3, %t4
+  store double %o, double* %Aj
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; In this test, the counting IV exit value is used, so full strength reduction
+; would not reduce register pressure. IndVarSimplify ought to simplify such
+; cases away, but it's useful here to verify that LSR's register pressure
+; heuristics are working as expected.
+
+; CHECK: count_me_0:
+; CHECK: movsd (%rsi,%rax,8), %xmm0
+; CHECK: mulsd (%rdx,%rax,8), %xmm0
+; CHECK: movsd %xmm0, (%rdi,%rax,8)
+; CHECK: incq %rax
+; CHECK: cmpq %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_0(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  ret i64 %q
+}
+
+; In this test, the trip count value is used, so full strength reduction
+; would not reduce register pressure.
+; (though it would reduce register pressure inside the loop...)
+
+; CHECK: count_me_1:
+; CHECK: movsd (%rsi,%rax,8), %xmm0
+; CHECK: mulsd (%rdx,%rax,8), %xmm0
+; CHECK: movsd %xmm0, (%rdi,%rax,8)
+; CHECK: incq %rax
+; CHECK: cmpq %rax, %rcx
+; CHECK: jne
+
+define i64 @count_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  %q = phi i64 [ 0, %entry ], [ %n, %loop ]
+  ret i64 %q
+}
+
+; Full strength reduction doesn't save any registers here because the
+; loop tripcount is a constant.
+
+; CHECK: count_me_2:
+; CHECK: movl $10, %eax
+; CHECK: align
+; CHECK: BB7_1:
+; CHECK: movsd -40(%rdi,%rax,8), %xmm0
+; CHECK: addsd -40(%rsi,%rax,8), %xmm0
+; CHECK: movsd %xmm0, -40(%rdx,%rax,8)
+; CHECK: movsd (%rdi,%rax,8), %xmm0
+; CHECK: subsd (%rsi,%rax,8), %xmm0
+; CHECK: movsd %xmm0, (%rdx,%rax,8)
+; CHECK: incq %rax
+; CHECK: cmpq $5010, %rax
+; CHECK: jne
+
+define void @count_me_2(double* nocapture %A, double* nocapture %B, double* nocapture %C) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i5 = add i64 %i, 5
+  %Ai = getelementptr double* %A, i64 %i5
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10
+  %Ai10 = getelementptr double* %A, i64 %i10
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10
+  %t11 = load double* %Bi10
+  %t12 = fsub double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, 5000
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; This should be fully strength-reduced to reduce register pressure.
+
+; CHECK: full_me_1:
+; CHECK: align
+; CHECK: BB8_1:
+; CHECK: movsd (%rdi), %xmm0
+; CHECK: addsd (%rsi), %xmm0
+; CHECK: movsd %xmm0, (%rdx)
+; CHECK: movsd 40(%rdi), %xmm0
+; CHECK: addq $8, %rdi
+; CHECK: subsd 40(%rsi), %xmm0
+; CHECK: addq $8, %rsi
+; CHECK: movsd %xmm0, 40(%rdx)
+; CHECK: addq $8, %rdx
+; CHECK: decq %rcx
+; CHECK: jne
+
+define void @full_me_1(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  br label %loop
+
+loop:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+  %i5 = add i64 %i, 5
+  %Ai = getelementptr double* %A, i64 %i5
+  %t2 = load double* %Ai
+  %Bi = getelementptr double* %B, i64 %i5
+  %t4 = load double* %Bi
+  %t5 = fadd double %t2, %t4
+  %Ci = getelementptr double* %C, i64 %i5
+  store double %t5, double* %Ci
+  %i10 = add i64 %i, 10
+  %Ai10 = getelementptr double* %A, i64 %i10
+  %t9 = load double* %Ai10
+  %Bi10 = getelementptr double* %B, i64 %i10
+  %t11 = load double* %Bi10
+  %t12 = fsub double %t9, %t11
+  %Ci10 = getelementptr double* %C, i64 %i10
+  store double %t12, double* %Ci10
+  %i.next = add i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
+
+; This is a variation on full_me_0 in which the 0,+,1 induction variable
+; has a non-address use, pinning that value in a register.
+
+; CHECK: count_me_3:
+; CHECK: call
+; CHECK: movsd (%r15,%r13,8), %xmm0
+; CHECK: mulsd (%r14,%r13,8), %xmm0
+; CHECK: movsd %xmm0, (%r12,%r13,8)
+; CHECK: incq %r13
+; CHECK: cmpq %r13, %rbx
+; CHECK: jne
+
+declare void @use(i64)
+
+define void @count_me_3(double* nocapture %A, double* nocapture %B, double* nocapture %C, i64 %n) nounwind {
+entry:
+  %t0 = icmp sgt i64 %n, 0
+  br i1 %t0, label %loop, label %return
+
+loop:
+  %i = phi i64 [ %i.next, %loop ], [ 0, %entry ]
+  call void @use(i64 %i)
+  %Ai = getelementptr inbounds double* %A, i64 %i
+  %Bi = getelementptr inbounds double* %B, i64 %i
+  %Ci = getelementptr inbounds double* %C, i64 %i
+  %t1 = load double* %Bi
+  %t2 = load double* %Ci
+  %m = fmul double %t1, %t2
+  store double %m, double* %Ai
+  %i.next = add nsw i64 %i, 1
+  %exitcond = icmp eq i64 %i.next, %n
+  br i1 %exitcond, label %return, label %loop
+
+return:
+  ret void
+}
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index bc493bd8f7..0b4d73a683 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -169,7 +169,7 @@ loop:
 	%indvar.i24 = and i64 %indvar, 16777215
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
@@ -199,7 +199,7 @@ loop:
 	%indvar.i24 = ashr i64 %s1, 24
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
@@ -229,7 +229,7 @@ loop:
 	%indvar.i24 = ashr i64 %s1, 24
 	%t3 = getelementptr double* %d, i64 %indvar.i24
 	%t4 = load double* %t3
-	%t5 = fmul double %t4, 2.3
+	%t5 = fdiv double %t4, 2.3
 	store double %t5, double* %t3
 	%t6 = getelementptr double* %d, i64 %indvar
 	%t7 = load double* %t6
diff --git a/test/CodeGen/X86/pr3495.ll b/test/CodeGen/X86/pr3495.ll
index 14f2a54486..e84a84f59b 100644
--- a/test/CodeGen/X86/pr3495.ll
+++ b/test/CodeGen/X86/pr3495.ll
@@ -1,8 +1,7 @@
 ; RUN: llc < %s -march=x86 -stats |& grep {Number of loads added} | grep 2
 ; RUN: llc < %s -march=x86 -stats |& grep {Number of register spills} | grep 1
-; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 37
+; RUN: llc < %s -march=x86 -stats |& grep {Number of machine instrs printed} | grep 34
 ; PR3495
-; The loop reversal kicks in once here, resulting in one fewer instruction.
 
 target triple = "i386-pc-linux-gnu"
 @x = external global [8 x i32], align 32		; <[8 x i32]*> [#uses=1]
--
cgit v1.2.3