From c5114dbcc3cd5a768bed43e5ae88d87a88b4a1b1 Mon Sep 17 00:00:00 2001 From: Chandler Carruth Date: Fri, 27 Jun 2014 11:40:13 +0000 Subject: [x86] Teach the target combine step to aggressively fold pshufd instructions. Summary: This allows it to fold pshufd instructions across intervening half-shuffles and other noise. This pattern actually shows up in the generic lowering tests, but I've also added direct tests using intrinsics to make sure that the specific desired functionality is working even if the lowering stuff changes in the future. Differential Revision: http://reviews.llvm.org/D4292 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211892 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/vector-shuffle-128-v8.ll | 3 +- test/CodeGen/X86/vector-shuffle-combining.ll | 60 ++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 2 deletions(-) (limited to 'test') diff --git a/test/CodeGen/X86/vector-shuffle-128-v8.ll b/test/CodeGen/X86/vector-shuffle-128-v8.ll index 091822b6cb..5d1922a348 100644 --- a/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -157,9 +157,8 @@ define <8 x i16> @shuffle_v8i16_26401375(<8 x i16> %a, <8 x i16> %b) { ; CHECK-SSE2: # BB#0: ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[0,2,1,3,4,5,6,7] ; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,5,4,6] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,2,1] +; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,3,1,2] ; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[1,3,2,0,4,5,6,7] -; CHECK-SSE2-NEXT: pshufd {{.*}} # xmm0 = xmm0[0,1,3,2] ; CHECK-SSE2-NEXT: retq %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 2, i32 6, i32 4, i32 0, i32 1, i32 3, i32 7, i32 5> ret <8 x i16> %shuffle diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll index dae1ef5875..1bc2aee6ef 100644 --- a/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3,9 +3,69 @@ target datalayout 
= "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" +declare <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32>, i8) declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8) declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8) +define <4 x i32> @combine_pshufd1(<4 x i32> %a) { +; CHECK-SSE2-LABEL: @combine_pshufd1 +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: retq + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) + %c = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %b, i8 27) + ret <4 x i32> %c +} + +define <4 x i32> @combine_pshufd2(<4 x i32> %a) { +; CHECK-SSE2-LABEL: @combine_pshufd2 +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: retq + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) + %b.cast = bitcast <4 x i32> %b to <8 x i16> + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 -28) + %c.cast = bitcast <8 x i16> %c to <4 x i32> + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + ret <4 x i32> %d +} + +define <4 x i32> @combine_pshufd3(<4 x i32> %a) { +; CHECK-SSE2-LABEL: @combine_pshufd3 +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: retq + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 27) + %b.cast = bitcast <4 x i32> %b to <8 x i16> + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 -28) + %c.cast = bitcast <8 x i16> %c to <4 x i32> + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 27) + ret <4 x i32> %d +} + +define <4 x i32> @combine_pshufd4(<4 x i32> %a) { +; CHECK-SSE2-LABEL: @combine_pshufd4 +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4] +; CHECK-SSE2-NEXT: retq + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -31) + %b.cast = bitcast <4 x i32> %b to <8 x i16> + %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b.cast, i8 27) + %c.cast = bitcast <8 x i16> %c to <4 x i32> + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -31) + ret <4 x i32> %d 
+} + +define <4 x i32> @combine_pshufd5(<4 x i32> %a) { +; CHECK-SSE2-LABEL: @combine_pshufd5 +; CHECK-SSE2: # BB#0: +; CHECK-SSE2-NEXT: pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7] +; CHECK-SSE2-NEXT: retq + %b = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %a, i8 -76) + %b.cast = bitcast <4 x i32> %b to <8 x i16> + %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b.cast, i8 27) + %c.cast = bitcast <8 x i16> %c to <4 x i32> + %d = call <4 x i32> @llvm.x86.sse2.pshuf.d(<4 x i32> %c.cast, i8 -76) + ret <4 x i32> %d +} + define <8 x i16> @combine_pshuflw1(<8 x i16> %a) { ; CHECK-SSE2-LABEL: @combine_pshuflw1 ; CHECK-SSE2: # BB#0: -- cgit v1.2.3