author     Eli Friedman <eli.friedman@gmail.com>  2011-11-04 23:46:11 +0000
committer  Eli Friedman <eli.friedman@gmail.com>  2011-11-04 23:46:11 +0000
commit     bd00a934c653fb1666fa7d18267644b4e9d14e5e (patch)
tree       5419d6ccb8710b93b51c4ad8c73eb4c924f3f0b3 /test/CodeGen
parent     451afbc6a22a3a662eea7e86088c65c36e84949f (diff)
Enhanced the vzeroupper insertion pass to use local analysis to avoid inserting vzeroupper where it is unnecessary. Patch from Bruno Cardoso Lopes, with some additional changes.
I'm going to wait for any review comments and perform some additional testing before turning this on by default.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@143750 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/X86/avx-vzeroupper.ll | 83
 1 file changed, 70 insertions(+), 13 deletions(-)
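For context, the trivial case handled by the enhanced pass can be reproduced with a minimal IR file shaped like test00 in the diff below. This is only an illustrative sketch: the RUN line reuses the flags from the test itself, while the function name @no_ymm_use and its CHECK labels are invented for this example and are not part of the patch. Since only 128-bit values are touched, the pass should not need to insert vzeroupper before the call.

; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s

; Illustrative sketch (not part of the patch): mirrors test00 from the diff below.
declare <4 x float> @do_sse(<4 x float>)

; No 256-bit values are live here, so the upper halves of the ymm registers
; are never dirtied and no vzeroupper should be emitted.
; CHECK: _no_ymm_use
; CHECK-NOT: vzeroupper
; CHECK: ret
define <4 x float> @no_ymm_use(<4 x float> %a, <4 x float> %b) nounwind {
entry:
  %add = fadd <4 x float> %a, %b
  %call = call <4 x float> @do_sse(<4 x float> %add) nounwind
  ret <4 x float> %call
}

The tests in the diff below cover the remaining cases, where 256-bit state is live and a vzeroupper is expected before a call or return.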
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index eaf236c6c7..bf4ab5be15 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,26 +1,83 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
-define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
-entry:
- %add.i = fadd <4 x float> %a, %a
- ret <4 x float> %add.i
-}
+declare <4 x float> @do_sse(<4 x float>)
+declare <8 x float> @do_avx(<8 x float>)
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+@x = common global <4 x float> zeroinitializer, align 16
+@g = common global <8 x float> zeroinitializer, align 32
+
+;; Basic checking - don't emit any vzeroupper instruction
; CHECK: _test00
define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
+ ; CHECK-NOT: vzeroupper
%add.i = fadd <4 x float> %a, %b
+ %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
+ ; CHECK: ret
+ ret <4 x float> %call3
+}
+
+;; Check 256-bit parameter passing
+
+; CHECK: _test01
+define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
+entry:
+ %tmp = load <4 x float>* @x, align 16
; CHECK: vzeroupper
; CHECK-NEXT: callq _do_sse
- %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind
- %sub.i = fsub <4 x float> %call3, %add.i
+ %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
+ store <4 x float> %call, <4 x float>* @x, align 16
; CHECK-NOT: vzeroupper
- ; CHECK: callq _do_sse_local
- %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i)
+ ; CHECK: callq _do_sse
+ %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
+ store <4 x float> %call2, <4 x float>* @x, align 16
+ ; CHECK: ret
+ ret <8 x float> %c
+}
+
+;; Test the pass convergence and also that vzeroupper is only issued when necessary;
+;; for this function it should be issued only once
+
+; CHECK: _test02
+define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+ %add.i = fadd <4 x float> %a, %b
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ ; CHECK: LBB
+ ; CHECK-NOT: vzeroupper
+ %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
+ %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+ ; CHECK: callq _do_sse
+ %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
+ ; CHECK-NEXT: callq _do_sse
+ %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
+ %tmp11 = load <8 x float>* @g, align 32
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
; CHECK: vzeroupper
- ; CHECK-NEXT: jmp _do_sse
- %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind
- ret <4 x float> %call10
+ ; CHECK-NEXT: callq _do_sse
+ %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
+ %1 = add nsw i32 %i.018, 1
+ %exitcond = icmp eq i32 %1, 4
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret <4 x float> %call14
}
-declare <4 x float> @do_sse(<4 x float>)
+;; Check that we also emit vzeroupper when returning from a function.
+
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+ %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ; CHECK-NOT: vzeroupper
+ ; CHECK: call
+ %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
+ %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ; CHECK: vzeroupper
+ ; CHECK: ret
+ ret <4 x float> %shuf2
+}