From 4363b0729b2b2aa50949ac9fb3af5ced0240898f Mon Sep 17 00:00:00 2001
From: Chandler Carruth
Date: Fri, 27 Jun 2014 11:34:40 +0000
Subject: [x86] Teach the target-specific combining how to aggressively fold
 half-shuffles, even looking through intervening instructions in a chain.

Summary:
This doesn't happen to show up with any test cases I've found for the
current shuffle lowering, but previous attempts would benefit from this
and it seems generally useful. I've tested it directly using
intrinsics, which also shows that it will work with hand vectorized
code as well.

Note that even though pshufd isn't directly used in these tests, it
gets exercised because we combine some of the half shuffles into a
pshufd first, and then merge them.

Differential Revision: http://reviews.llvm.org/D4291

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211890 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/vector-shuffle-combining.ll | 49 ++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)
 create mode 100644 test/CodeGen/X86/vector-shuffle-combining.ll

diff --git a/test/CodeGen/X86/vector-shuffle-combining.ll b/test/CodeGen/X86/vector-shuffle-combining.ll
new file mode 100644
index 0000000000..dae1ef5875
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -0,0 +1,49 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+declare <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16>, i8)
+
+define <8 x i16> @combine_pshuflw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw1
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+  ret <8 x i16> %c
+}
+
+define <8 x i16> @combine_pshuflw2(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw2
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    retq
+  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 -28)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+  ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshuflw3(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshuflw3
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshufhw {{.*}} # xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %c, i8 27)
+  ret <8 x i16> %d
+}
+
+define <8 x i16> @combine_pshufhw1(<8 x i16> %a) {
+; CHECK-SSE2-LABEL: @combine_pshufhw1
+; CHECK-SSE2:       # BB#0:
+; CHECK-SSE2-NEXT:    pshuflw {{.*}} # xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; CHECK-SSE2-NEXT:    retq
+  %b = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %a, i8 27)
+  %c = call <8 x i16> @llvm.x86.sse2.pshufl.w(<8 x i16> %b, i8 27)
+  %d = call <8 x i16> @llvm.x86.sse2.pshufh.w(<8 x i16> %c, i8 27)
+  ret <8 x i16> %d
+}
+
--
cgit v1.2.3
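
For reference, a minimal hand-vectorized C sketch (not part of the patch; the
function name is made up for illustration) of the pattern the summary
describes. It mirrors the combine_pshuflw2 test above using the corresponding
SSE2 intrinsics: immediate 0x1B (27) reverses the four low words, 0xE4 (-28 as
an i8) is the identity order for the high words, so a compiler with this
combine should fold the whole function down to a plain return.

    /* Illustrative only: two cancelling low-word shuffles around an
       identity high-word shuffle; the combine should remove all three. */
    #include <emmintrin.h>

    __m128i fold_half_shuffles(__m128i a) {
      __m128i b = _mm_shufflelo_epi16(a, 0x1B); /* reverse words 0-3   */
      __m128i c = _mm_shufflehi_epi16(b, 0xE4); /* identity on words 4-7 */
      return _mm_shufflelo_epi16(c, 0x1B);      /* undo the reversal   */
    }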