From 5e6ebaf4d1d3043d3428b65ee8054c71c24af930 Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Wed, 23 Jul 2008 00:22:17 +0000 Subject: Fix PR2485: do all 4-element SSE shuffles in max. of 2 shuffle instructions. Based on patch by Nicolas Capens. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@53939 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 65 +++++++++++++++++++++++++++++++++++--- test/CodeGen/X86/vec_insert-2.ll | 5 ++- test/CodeGen/X86/vec_insert-6.ll | 3 +- test/CodeGen/X86/vec_shuffle-19.ll | 8 +++++ 4 files changed, 72 insertions(+), 9 deletions(-) create mode 100644 test/CodeGen/X86/vec_shuffle-19.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 2ebda55060..e8e1b2e7ea 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -3655,13 +3655,8 @@ LowerVECTOR_SHUFFLE_4wide(SDOperand V1, SDOperand V2, SmallVector, 8> Locs; Locs.reserve(4); SmallVector Mask1(4, DAG.getNode(ISD::UNDEF, MaskEVT)); - SmallVector Mask2(4, DAG.getNode(ISD::UNDEF, MaskEVT)); unsigned NumHi = 0; unsigned NumLo = 0; - // If no more than two elements come from either vector. This can be - // implemented with two shuffles. First shuffle gather the elements. - // The second shuffle, which takes the first shuffle as both of its - // vector operands, put the elements into the right order. for (unsigned i = 0; i != 4; ++i) { SDOperand Elt = PermMask.getOperand(i); if (Elt.getOpcode() == ISD::UNDEF) { @@ -3680,10 +3675,17 @@ LowerVECTOR_SHUFFLE_4wide(SDOperand V1, SDOperand V2, } } } + if (NumLo <= 2 && NumHi <= 2) { + // If no more than two elements come from either vector. This can be + // implemented with two shuffles. First shuffle gather the elements. + // The second shuffle, which takes the first shuffle as both of its + // vector operands, put the elements into the right order. V1 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], Mask1.size())); + + SmallVector Mask2(4, DAG.getNode(ISD::UNDEF, MaskEVT)); for (unsigned i = 0; i != 4; ++i) { if (Locs[i].first == -1) continue; @@ -3697,6 +3699,59 @@ LowerVECTOR_SHUFFLE_4wide(SDOperand V1, SDOperand V2, return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V1, DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask2[0], Mask2.size())); + } else if (NumLo == 3 || NumHi == 3) { + // Otherwise, we must have three elements from one vector, call it X, and + // one element from the other, call it Y. First, use a shufps to build an + // intermediate vector with the one element from Y and the element from X + // that will be in the same half in the final destination (the indexes don't + // matter). Then, use a shufps to build the final vector, taking the half + // containing the element from Y from the intermediate, and the other half + // from X. + if (NumHi == 3) { + // Normalize it so the 3 elements come from V1. + PermMask = CommuteVectorShuffleMask(PermMask, DAG); + std::swap(V1, V2); + } + + // Find the element from V2. + unsigned HiIndex; + for (HiIndex = 0; HiIndex < 3; ++HiIndex) { + SDOperand Elt = PermMask.getOperand(HiIndex); + if (Elt.getOpcode() == ISD::UNDEF) + continue; + unsigned Val = cast(Elt)->getValue(); + if (Val >= 4) + break; + } + + Mask1[0] = PermMask.getOperand(HiIndex); + Mask1[1] = DAG.getNode(ISD::UNDEF, MaskEVT); + Mask1[2] = PermMask.getOperand(HiIndex^1); + Mask1[3] = DAG.getNode(ISD::UNDEF, MaskEVT); + V2 = DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + + if (HiIndex >= 2) { + Mask1[0] = PermMask.getOperand(0); + Mask1[1] = PermMask.getOperand(1); + Mask1[2] = DAG.getConstant(HiIndex & 1 ? 6 : 4, MaskEVT); + Mask1[3] = DAG.getConstant(HiIndex & 1 ? 4 : 6, MaskEVT); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V1, V2, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + } else { + Mask1[0] = DAG.getConstant(HiIndex & 1 ? 2 : 0, MaskEVT); + Mask1[1] = DAG.getConstant(HiIndex & 1 ? 0 : 2, MaskEVT); + Mask1[2] = PermMask.getOperand(2); + Mask1[3] = PermMask.getOperand(3); + if (Mask1[2].getOpcode() != ISD::UNDEF) + Mask1[2] = DAG.getConstant(cast(Mask1[2])->getValue()+4, + MaskEVT); + if (Mask1[3].getOpcode() != ISD::UNDEF) + Mask1[3] = DAG.getConstant(cast(Mask1[3])->getValue()+4, + MaskEVT); + return DAG.getNode(ISD::VECTOR_SHUFFLE, VT, V2, V1, + DAG.getNode(ISD::BUILD_VECTOR, MaskVT, &Mask1[0], 4)); + } } // Break it into (shuffle shuffle_hi, shuffle_lo). diff --git a/test/CodeGen/X86/vec_insert-2.ll b/test/CodeGen/X86/vec_insert-2.ll index 8207afe634..a6d4f014d2 100644 --- a/test/CodeGen/X86/vec_insert-2.ll +++ b/test/CodeGen/X86/vec_insert-2.ll @@ -1,6 +1,5 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {\$132,} | count 2 -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {\$2,} | count 2 -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep shufps | count 4 +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep {\$36,} | count 2 +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep shufps | count 2 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pinsrw | count 1 ; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep movhpd | count 1 ; RUN: llvm-as < %s | llc -march=x86-64 -mattr=+sse2 | grep unpcklpd | count 1 diff --git a/test/CodeGen/X86/vec_insert-6.ll b/test/CodeGen/X86/vec_insert-6.ll index 405152e2dc..865a6f009a 100644 --- a/test/CodeGen/X86/vec_insert-6.ll +++ b/test/CodeGen/X86/vec_insert-6.ll @@ -1,4 +1,5 @@ -; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | not grep pslldq +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 | grep pslldq +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -stats -info-output-file - | grep asm-printer | grep 6 define <4 x float> @t3(<4 x float>* %P) nounwind { %tmp1 = load <4 x float>* %P diff --git a/test/CodeGen/X86/vec_shuffle-19.ll b/test/CodeGen/X86/vec_shuffle-19.ll new file mode 100644 index 0000000000..eeeab81362 --- /dev/null +++ b/test/CodeGen/X86/vec_shuffle-19.ll @@ -0,0 +1,8 @@ +; RUN: llvm-as < %s | llc -march=x86 -mattr=+sse2 -stats -info-output-file - | grep asm-printer | grep 4 +; PR2485 + +define <4 x i32> @t(<4 x i32> %a, <4 x i32> %b) nounwind { +entry: + %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> < i32 4, i32 0, i32 0, i32 0 > ; <<4 x i32>> [#uses=1] + ret <4 x i32> %shuffle +} -- cgit v1.2.3