From c692cb77aaa8b16bcc7fe0c70d47adce94c43911 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Fri, 21 Aug 2009 20:54:19 +0000 Subject: Match VTRN, VZIP, and VUZP shuffles. Restore the tests for these operations, now using shuffles instead of intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@79673 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 78 +++++++++++++++++++++++++++--- lib/Target/ARM/ARMISelLowering.h | 5 +- test/CodeGen/ARM/vtrn.ll | 99 ++++++++++++++++++++++++++++++++++++++ test/CodeGen/ARM/vuzp.ll | 79 ++++++++++++++++++++++++++++++ test/CodeGen/ARM/vzip.ll | 79 ++++++++++++++++++++++++++++++ 5 files changed, 331 insertions(+), 9 deletions(-) create mode 100644 test/CodeGen/ARM/vtrn.ll create mode 100644 test/CodeGen/ARM/vuzp.ll create mode 100644 test/CodeGen/ARM/vzip.ll diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 1c59cb01fa..f04b45dc79 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -2403,6 +2403,53 @@ static bool isVREVMask(const SmallVectorImpl &M, EVT VT, return true; } +static bool isVTRNMask(const SmallVectorImpl &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i < NumElts; i += 2) { + if ((unsigned) M[i] != i + WhichResult || + (unsigned) M[i+1] != i + NumElts + WhichResult) + return false; + } + return true; +} + +static bool isVUZPMask(const SmallVectorImpl &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + for (unsigned i = 0; i != NumElts; ++i) { + if ((unsigned) M[i] != 2 * i + WhichResult) + return false; + } + + // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32) + return false; + + return true; +} + +static bool isVZIPMask(const SmallVectorImpl &M, EVT VT, + unsigned &WhichResult) { + unsigned NumElts = VT.getVectorNumElements(); + WhichResult = (M[0] == 0 ? 0 : 1); + unsigned Idx = WhichResult * NumElts / 2; + for (unsigned i = 0; i != NumElts; i += 2) { + if ((unsigned) M[i] != Idx || + (unsigned) M[i+1] != Idx + NumElts) + return false; + Idx += 1; + } + + // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32. + if (VT.is64BitVector() && VT.getVectorElementType().getSizeInBits() == 32) + return false; + + return true; +} + static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) { // Canonicalize all-zeros and all-ones vectors. ConstantSDNode *ConstVal = cast(Val.getNode()); @@ -2504,13 +2551,16 @@ ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl &M, } bool ReverseVEXT; - unsigned Imm; + unsigned Imm, WhichResult; return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isVREVMask(M, VT, 64) || isVREVMask(M, VT, 32) || isVREVMask(M, VT, 16) || - isVEXTMask(M, VT, ReverseVEXT, Imm)); + isVEXTMask(M, VT, ReverseVEXT, Imm) || + isVTRNMask(M, VT, WhichResult) || + isVUZPMask(M, VT, WhichResult) || + isVZIPMask(M, VT, WhichResult)); } /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit @@ -2610,11 +2660,9 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { bool ReverseVEXT; unsigned Imm; if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) { - SDValue Op0 = SVN->getOperand(0); - SDValue Op1 = SVN->getOperand(1); if (ReverseVEXT) - std::swap(Op0, Op1); - return DAG.getNode(ARMISD::VEXT, dl, VT, Op0, Op1, + std::swap(V1, V2); + return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2, DAG.getConstant(Imm, MVT::i32)); } @@ -2625,6 +2673,24 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) { if (isVREVMask(ShuffleMask, VT, 16)) return DAG.getNode(ARMISD::VREV16, dl, VT, V1); + // Check for Neon shuffles that modify both input vectors in place. + // If both results are used, i.e., if there are two shuffles with the same + // source operands and with masks corresponding to both results of one of + // these operations, DAG memoization will ensure that a single node is + // used for both shuffles. + unsigned WhichResult; + if (isVTRNMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVUZPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + if (isVZIPMask(ShuffleMask, VT, WhichResult)) + return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT), + V1, V2).getValue(WhichResult); + + // If the shuffle is not directly supported and it has 4 elements, use + // the PerfectShuffle-generated table to synthesize it from other shuffles. if (VT.getVectorNumElements() == 4 && (VT.is128BitVector() || VT.is64BitVector())) { unsigned PFIndexes[4]; diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index b2260e2d83..cb88be5513 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -132,9 +132,8 @@ namespace llvm { VREV64, // reverse elements within 64-bit doublewords VREV32, // reverse elements within 32-bit words VREV16, // reverse elements within 16-bit halfwords - - VZIP, // zip - VUZP, // unzip + VZIP, // zip (interleave) + VUZP, // unzip (deinterleave) VTRN // transpose }; } diff --git a/test/CodeGen/ARM/vtrn.ll b/test/CodeGen/ARM/vtrn.ll new file mode 100644 index 0000000000..96dfb3ede4 --- /dev/null +++ b/test/CodeGen/ARM/vtrn.ll @@ -0,0 +1,99 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> } +%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> } +%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> } +%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> } + +%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> } +%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> } +%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> } +%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> } + +define <8 x i8> @vtrni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vtrni8: +;CHECK: vtrn.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp5 = add <8 x i8> %tmp3, %tmp4 + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vtrni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vtrni16: +;CHECK: vtrn.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp5 = add <4 x i16> %tmp3, %tmp4 + ret <4 x i16> %tmp5 +} + +define <2 x i32> @vtrni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vtrni32: +;CHECK: vtrn.32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> + %tmp4 = shufflevector <2 x i32> %tmp1, <2 x i32> %tmp2, <2 x i32> + %tmp5 = add <2 x i32> %tmp3, %tmp4 + ret <2 x i32> %tmp5 +} + +define <2 x float> @vtrnf(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vtrnf: +;CHECK: vtrn.32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> + %tmp4 = shufflevector <2 x float> %tmp1, <2 x float> %tmp2, <2 x i32> + %tmp5 = add <2 x float> %tmp3, %tmp4 + ret <2 x float> %tmp5 +} + +define <16 x i8> @vtrnQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vtrnQi8: +;CHECK: vtrn.8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp5 = add <16 x i8> %tmp3, %tmp4 + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vtrnQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vtrnQi16: +;CHECK: vtrn.16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vtrnQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vtrnQi32: +;CHECK: vtrn.32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp5 = add <4 x i32> %tmp3, %tmp4 + ret <4 x i32> %tmp5 +} + +define <4 x float> @vtrnQf(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vtrnQf: +;CHECK: vtrn.32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp5 = add <4 x float> %tmp3, %tmp4 + ret <4 x float> %tmp5 +} diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll new file mode 100644 index 0000000000..35beae36fb --- /dev/null +++ b/test/CodeGen/ARM/vuzp.ll @@ -0,0 +1,79 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> } +%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> } +%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> } +%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> } + +%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> } +%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> } +%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> } +%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> } + +define <8 x i8> @vuzpi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vuzpi8: +;CHECK: vuzp.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp5 = add <8 x i8> %tmp3, %tmp4 + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vuzpi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vuzpi16: +;CHECK: vuzp.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp5 = add <4 x i16> %tmp3, %tmp4 + ret <4 x i16> %tmp5 +} + +; VUZP.32 is equivalent to VTRN.32 for 64-bit vectors. + +define <16 x i8> @vuzpQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vuzpQi8: +;CHECK: vuzp.8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp5 = add <16 x i8> %tmp3, %tmp4 + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vuzpQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vuzpQi16: +;CHECK: vuzp.16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vuzpQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vuzpQi32: +;CHECK: vuzp.32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp5 = add <4 x i32> %tmp3, %tmp4 + ret <4 x i32> %tmp5 +} + +define <4 x float> @vuzpQf(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vuzpQf: +;CHECK: vuzp.32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp5 = add <4 x float> %tmp3, %tmp4 + ret <4 x float> %tmp5 +} diff --git a/test/CodeGen/ARM/vzip.ll b/test/CodeGen/ARM/vzip.ll new file mode 100644 index 0000000000..f8946558b4 --- /dev/null +++ b/test/CodeGen/ARM/vzip.ll @@ -0,0 +1,79 @@ +; RUN: llvm-as < %s | llc -march=arm -mattr=+neon | FileCheck %s + +%struct.__builtin_neon_v8qi2 = type { <8 x i8>, <8 x i8> } +%struct.__builtin_neon_v4hi2 = type { <4 x i16>, <4 x i16> } +%struct.__builtin_neon_v2si2 = type { <2 x i32>, <2 x i32> } +%struct.__builtin_neon_v2sf2 = type { <2 x float>, <2 x float> } + +%struct.__builtin_neon_v16qi2 = type { <16 x i8>, <16 x i8> } +%struct.__builtin_neon_v8hi2 = type { <8 x i16>, <8 x i16> } +%struct.__builtin_neon_v4si2 = type { <4 x i32>, <4 x i32> } +%struct.__builtin_neon_v4sf2 = type { <4 x float>, <4 x float> } + +define <8 x i8> @vzipi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vzipi8: +;CHECK: vzip.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + %tmp5 = add <8 x i8> %tmp3, %tmp4 + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vzipi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vzipi16: +;CHECK: vzip.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + %tmp5 = add <4 x i16> %tmp3, %tmp4 + ret <4 x i16> %tmp5 +} + +; VZIP.32 is equivalent to VTRN.32 for 64-bit vectors. + +define <16 x i8> @vzipQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vzipQi8: +;CHECK: vzip.8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp4 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + %tmp5 = add <16 x i8> %tmp3, %tmp4 + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vzipQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vzipQi16: +;CHECK: vzip.16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp4 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <8 x i32> + %tmp5 = add <8 x i16> %tmp3, %tmp4 + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vzipQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vzipQi32: +;CHECK: vzip.32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + %tmp5 = add <4 x i32> %tmp3, %tmp4 + ret <4 x i32> %tmp5 +} + +define <4 x float> @vzipQf(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vzipQf: +;CHECK: vzip.32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp4 = shufflevector <4 x float> %tmp1, <4 x float> %tmp2, <4 x i32> + %tmp5 = add <4 x float> %tmp3, %tmp4 + ret <4 x float> %tmp5 +} -- cgit v1.2.3