From 8a0ff1f236e77214878c9d493e786b30656ad2a1 Mon Sep 17 00:00:00 2001 From: Bill Wendling Date: Tue, 26 Nov 2013 10:57:43 +0000 Subject: Merging r195716: ------------------------------------------------------------------------ r195716 | kevinqin | 2013-11-25 19:26:47 -0800 (Mon, 25 Nov 2013) | 3 lines Refactored the implementation of AArch64 NEON instruction ZIP, UZP and TRN. Fix a bug when mixed use of vget_high_u8() and vuzp_u8(). ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_34@195735 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AArch64/AArch64ISelLowering.cpp | 165 ++++++++++++- lib/Target/AArch64/AArch64ISelLowering.h | 10 + lib/Target/AArch64/AArch64InstrNEON.td | 379 +++++------------------------ test/CodeGen/AArch64/neon-perm.ll | 14 ++ 4 files changed, 240 insertions(+), 328 deletions(-) diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 003359d1b5..ee98b4cf1d 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -921,6 +921,18 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { return "AArch64ISD::NEON_REV32"; case AArch64ISD::NEON_REV64: return "AArch64ISD::NEON_REV64"; + case AArch64ISD::NEON_UZP1: + return "AArch64ISD::NEON_UZP1"; + case AArch64ISD::NEON_UZP2: + return "AArch64ISD::NEON_UZP2"; + case AArch64ISD::NEON_ZIP1: + return "AArch64ISD::NEON_ZIP1"; + case AArch64ISD::NEON_ZIP2: + return "AArch64ISD::NEON_ZIP2"; + case AArch64ISD::NEON_TRN1: + return "AArch64ISD::NEON_TRN1"; + case AArch64ISD::NEON_TRN2: + return "AArch64ISD::NEON_TRN2"; case AArch64ISD::NEON_LD1_UPD: return "AArch64ISD::NEON_LD1_UPD"; case AArch64ISD::NEON_LD2_UPD: @@ -3826,6 +3838,59 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } +// Check whether a Build Vector could be presented as Shuffle Vector. If yes, +// try to call LowerVECTOR_SHUFFLE to lower it. +bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, + SDValue &Res) const { + SDLoc DL(Op); + EVT VT = Op.getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + unsigned V0NumElts = 0; + int Mask[16]; + SDValue V0, V1; + + // Check if all elements are extracted from less than 3 vectors. + for (unsigned i = 0; i < NumElts; ++i) { + SDValue Elt = Op.getOperand(i); + if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + if (V0.getNode() == 0) { + V0 = Elt.getOperand(0); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + if (Elt.getOperand(0) == V0) { + Mask[i] = (cast(Elt->getOperand(1))->getZExtValue()); + continue; + } else if (V1.getNode() == 0) { + V1 = Elt.getOperand(0); + } + if (Elt.getOperand(0) == V1) { + unsigned Lane = cast(Elt->getOperand(1))->getZExtValue(); + Mask[i] = (Lane + V0NumElts); + continue; + } else { + return false; + } + } + + if (!V1.getNode() && V0NumElts == NumElts * 2) { + V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(NumElts, MVT::i64)); + V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0, + DAG.getConstant(0, MVT::i64)); + V0NumElts = V0.getValueType().getVectorNumElements(); + } + + if (V1.getNode() && NumElts == V0NumElts && + V0NumElts == V1.getValueType().getVectorNumElements()) { + SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask); + Res = LowerVECTOR_SHUFFLE(Shuffle, DAG); + return true; + } else + return false; +} + // If this is a case we can't handle, return null and let the default // expansion code take care of it. SDValue @@ -3964,7 +4029,7 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SmallVector Ops; Ops.push_back(N); Ops.push_back(Op.getOperand(I)); - Ops.push_back(DAG.getConstant(I, MVT::i32)); + Ops.push_back(DAG.getConstant(I, MVT::i64)); N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3); } } @@ -3980,6 +4045,11 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, if (isConstant) return SDValue(); + // Try to lower this in lowering ShuffleVector way. + SDValue Shuf; + if (isKnownShuffleVector(Op, DAG, Shuf)) + return Shuf; + // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we // know the default expansion would otherwise fall back on something even // worse. For a vector with one or two non-undef values, that's @@ -3992,7 +4062,7 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDValue V = Op.getOperand(i); if (V.getOpcode() == ISD::UNDEF) continue; - SDValue LaneIdx = DAG.getConstant(i, MVT::i32); + SDValue LaneIdx = DAG.getConstant(i, MVT::i64); Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx); } return Vec; @@ -4030,6 +4100,83 @@ static bool isREVMask(ArrayRef M, EVT VT, unsigned BlockSize) { return true; } +// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and +// TRN instruction. +static unsigned isPermuteMask(ArrayRef M, EVT VT) { + unsigned NumElts = VT.getVectorNumElements(); + if (NumElts < 4) + return 0; + + bool ismatch = true; + + // Check UZP1 + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP1; + + // Check UZP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i * 2 + 1) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_UZP2; + + // Check ZIP1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP1; + + // Check ZIP2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != (NumElts + i) / 2 + NumElts * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_ZIP2; + + // Check TRN1 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN1; + + // Check TRN2 + ismatch = true; + for (unsigned i = 0; i < NumElts; ++i) { + if ((unsigned)M[i] != 1 + i + (NumElts - 1) * (i % 2)) { + ismatch = false; + break; + } + } + if (ismatch) + return AArch64ISD::NEON_TRN2; + + return 0; +} + SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { @@ -4056,6 +4203,10 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, if (isREVMask(ShuffleMask, VT, 16)) return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1); + unsigned ISDNo = isPermuteMask(ShuffleMask, VT); + if (ISDNo) + return DAG.getNode(ISDNo, dl, VT, V1, V2); + // If the element of shuffle mask are all the same constant, we can // transform it into either NEON_VDUP or NEON_VDUPLANE if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) { @@ -4167,10 +4318,12 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, else EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32; - ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, - DAG.getConstant(Mask, MVT::i64)); - InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, - DAG.getConstant(InsIndex[I], MVT::i64)); + if (Mask >= 0) { + ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV, + DAG.getConstant(Mask, MVT::i64)); + InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV, + DAG.getConstant(InsIndex[I], MVT::i64)); + } } return InsV; } diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index a51d10f01c..4cc2135143 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -125,6 +125,14 @@ namespace AArch64ISD { // Vector FP move immediate NEON_FMOVIMM, + // Vector permute + NEON_UZP1, + NEON_UZP2, + NEON_ZIP1, + NEON_ZIP2, + NEON_TRN1, + NEON_TRN2, + // Vector Element reverse NEON_REV64, NEON_REV32, @@ -225,6 +233,8 @@ public: SDLoc dl, SelectionDAG &DAG, SmallVectorImpl &InVals) const; + bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const; + SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget *ST) const; diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index c0c572a62e..f6e747a4c7 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -46,6 +46,15 @@ def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>; def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>; +def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisSameAs<0, 2>]>; +def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>; +def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>; +def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>; +def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>; +def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>; +def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>; + def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>; def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>; def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>; @@ -2384,331 +2393,57 @@ defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv", // The followings are for instruction class (Perm) class NeonI_Permute size, bits<3> opcode, - string asmop, RegisterOperand OpVPR, string OpS> + string asmop, RegisterOperand OpVPR, string OpS, + SDPatternOperator opnode, ValueType Ty> : NeonI_Perm; - -multiclass NeonI_Perm_pat opcode, string asmop> { - def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, VPR64, "8b">; - def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, VPR128, "16b">; - def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, VPR64, "4h">; - def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, VPR128, "8h">; - def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, VPR64, "2s">; - def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, VPR128, "4s">; - def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, VPR128, "2d">; -} - -defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1">; -defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1">; -defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1">; -defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2">; -defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2">; -defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2">; - -// Extract and Insert -def NI_ei_i32 : PatFrag<(ops node:$Rn, node:$Rm, node:$Ext, node:$Ins), - (vector_insert node:$Rn, - (i32 (vector_extract node:$Rm, node:$Ext)), - node:$Ins)>; - -def NI_ei_f32 : PatFrag<(ops node:$Rn, node:$Rm, node:$Ext, node:$Ins), - (vector_insert node:$Rn, - (f32 (vector_extract node:$Rm, node:$Ext)), - node:$Ins)>; - -// uzp1 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rn), - (v16i8 VPR128:$Rn), 2, 1)), - (v16i8 VPR128:$Rn), 4, 2)), - (v16i8 VPR128:$Rn), 6, 3)), - (v16i8 VPR128:$Rn), 8, 4)), - (v16i8 VPR128:$Rn), 10, 5)), - (v16i8 VPR128:$Rn), 12, 6)), - (v16i8 VPR128:$Rn), 14, 7)), - (v16i8 VPR128:$Rm), 0, 8)), - (v16i8 VPR128:$Rm), 2, 9)), - (v16i8 VPR128:$Rm), 4, 10)), - (v16i8 VPR128:$Rm), 6, 11)), - (v16i8 VPR128:$Rm), 8, 12)), - (v16i8 VPR128:$Rm), 10, 13)), - (v16i8 VPR128:$Rm), 12, 14)), - (v16i8 VPR128:$Rm), 14, 15)), - (UZP1vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Uzp1_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rn), - (Ty VPR:$Rn), 2, 1)), - (Ty VPR:$Rn), 4, 2)), - (Ty VPR:$Rn), 6, 3)), - (Ty VPR:$Rm), 0, 4)), - (Ty VPR:$Rm), 2, 5)), - (Ty VPR:$Rm), 4, 6)), - (Ty VPR:$Rm), 6, 7)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Uzp1_v8; -def : NI_Uzp1_v8; - -class NI_Uzp1_v4 - : Pat<(Ty (ei (Ty (ei (Ty (ei - (Ty VPR:$Rn), - (Ty VPR:$Rn), 2, 1)), - (Ty VPR:$Rm), 0, 2)), - (Ty VPR:$Rm), 2, 3)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Uzp1_v4; -def : NI_Uzp1_v4; -def : NI_Uzp1_v4; - -// uzp2 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rm), - (v16i8 VPR128:$Rn), 1, 0)), - (v16i8 VPR128:$Rn), 3, 1)), - (v16i8 VPR128:$Rn), 5, 2)), - (v16i8 VPR128:$Rn), 7, 3)), - (v16i8 VPR128:$Rn), 9, 4)), - (v16i8 VPR128:$Rn), 11, 5)), - (v16i8 VPR128:$Rn), 13, 6)), - (v16i8 VPR128:$Rn), 15, 7)), - (v16i8 VPR128:$Rm), 1, 8)), - (v16i8 VPR128:$Rm), 3, 9)), - (v16i8 VPR128:$Rm), 5, 10)), - (v16i8 VPR128:$Rm), 7, 11)), - (v16i8 VPR128:$Rm), 9, 12)), - (v16i8 VPR128:$Rm), 11, 13)), - (v16i8 VPR128:$Rm), 13, 14)), - (UZP2vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Uzp2_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rm), - (Ty VPR:$Rn), 1, 0)), - (Ty VPR:$Rn), 3, 1)), - (Ty VPR:$Rn), 5, 2)), - (Ty VPR:$Rn), 7, 3)), - (Ty VPR:$Rm), 1, 4)), - (Ty VPR:$Rm), 3, 5)), - (Ty VPR:$Rm), 5, 6)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Uzp2_v8; -def : NI_Uzp2_v8; - -class NI_Uzp2_v4 - : Pat<(Ty (ei (Ty (ei (Ty (ei - (Ty VPR:$Rm), - (Ty VPR:$Rn), 1, 0)), - (Ty VPR:$Rn), 3, 1)), - (Ty VPR:$Rm), 1, 2)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Uzp2_v4; -def : NI_Uzp2_v4; -def : NI_Uzp2_v4; - -// zip1 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rn), - (v16i8 VPR128:$Rm), 0, 1)), - (v16i8 VPR128:$Rn), 1, 2)), - (v16i8 VPR128:$Rm), 1, 3)), - (v16i8 VPR128:$Rn), 2, 4)), - (v16i8 VPR128:$Rm), 2, 5)), - (v16i8 VPR128:$Rn), 3, 6)), - (v16i8 VPR128:$Rm), 3, 7)), - (v16i8 VPR128:$Rn), 4, 8)), - (v16i8 VPR128:$Rm), 4, 9)), - (v16i8 VPR128:$Rn), 5, 10)), - (v16i8 VPR128:$Rm), 5, 11)), - (v16i8 VPR128:$Rn), 6, 12)), - (v16i8 VPR128:$Rm), 6, 13)), - (v16i8 VPR128:$Rn), 7, 14)), - (v16i8 VPR128:$Rm), 7, 15)), - (ZIP1vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Zip1_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rn), - (Ty VPR:$Rm), 0, 1)), - (Ty VPR:$Rn), 1, 2)), - (Ty VPR:$Rm), 1, 3)), - (Ty VPR:$Rn), 2, 4)), - (Ty VPR:$Rm), 2, 5)), - (Ty VPR:$Rn), 3, 6)), - (Ty VPR:$Rm), 3, 7)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Zip1_v8; -def : NI_Zip1_v8; - -class NI_Zip1_v4 - : Pat<(Ty (ei (Ty (ei (Ty (ei - (Ty VPR:$Rn), - (Ty VPR:$Rm), 0, 1)), - (Ty VPR:$Rn), 1, 2)), - (Ty VPR:$Rm), 1, 3)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Zip1_v4; -def : NI_Zip1_v4; -def : NI_Zip1_v4; - -// zip2 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rm), - (v16i8 VPR128:$Rn), 8, 0)), - (v16i8 VPR128:$Rm), 8, 1)), - (v16i8 VPR128:$Rn), 9, 2)), - (v16i8 VPR128:$Rm), 9, 3)), - (v16i8 VPR128:$Rn), 10, 4)), - (v16i8 VPR128:$Rm), 10, 5)), - (v16i8 VPR128:$Rn), 11, 6)), - (v16i8 VPR128:$Rm), 11, 7)), - (v16i8 VPR128:$Rn), 12, 8)), - (v16i8 VPR128:$Rm), 12, 9)), - (v16i8 VPR128:$Rn), 13, 10)), - (v16i8 VPR128:$Rm), 13, 11)), - (v16i8 VPR128:$Rn), 14, 12)), - (v16i8 VPR128:$Rm), 14, 13)), - (v16i8 VPR128:$Rn), 15, 14)), - (ZIP2vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Zip2_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rm), - (Ty VPR:$Rn), 4, 0)), - (Ty VPR:$Rm), 4, 1)), - (Ty VPR:$Rn), 5, 2)), - (Ty VPR:$Rm), 5, 3)), - (Ty VPR:$Rn), 6, 4)), - (Ty VPR:$Rm), 6, 5)), - (Ty VPR:$Rn), 7, 6)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Zip2_v8; -def : NI_Zip2_v8; - -class NI_Zip2_v4 - : Pat<(Ty (ei (Ty (ei (Ty (ei - (Ty VPR:$Rm), - (Ty VPR:$Rn), 2, 0)), - (Ty VPR:$Rm), 2, 1)), - (Ty VPR:$Rn), 3, 2)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Zip2_v4; -def : NI_Zip2_v4; -def : NI_Zip2_v4; - -// trn1 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rn), - (v16i8 VPR128:$Rm), 0, 1)), - (v16i8 VPR128:$Rm), 2, 3)), - (v16i8 VPR128:$Rm), 4, 5)), - (v16i8 VPR128:$Rm), 6, 7)), - (v16i8 VPR128:$Rm), 8, 9)), - (v16i8 VPR128:$Rm), 10, 11)), - (v16i8 VPR128:$Rm), 12, 13)), - (v16i8 VPR128:$Rm), 14, 15)), - (TRN1vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Trn1_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rn), - (Ty VPR:$Rm), 0, 1)), - (Ty VPR:$Rm), 2, 3)), - (Ty VPR:$Rm), 4, 5)), - (Ty VPR:$Rm), 6, 7)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Trn1_v8; -def : NI_Trn1_v8; - -class NI_Trn1_v4 - : Pat<(Ty (ei (Ty (ei - (Ty VPR:$Rn), - (Ty VPR:$Rm), 0, 1)), - (Ty VPR:$Rm), 2, 3)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Trn1_v4; -def : NI_Trn1_v4; -def : NI_Trn1_v4; - -// trn2 -def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 - (v16i8 VPR128:$Rm), - (v16i8 VPR128:$Rn), 1, 0)), - (v16i8 VPR128:$Rn), 3, 2)), - (v16i8 VPR128:$Rn), 5, 4)), - (v16i8 VPR128:$Rn), 7, 6)), - (v16i8 VPR128:$Rn), 9, 8)), - (v16i8 VPR128:$Rn), 11, 10)), - (v16i8 VPR128:$Rn), 13, 12)), - (v16i8 VPR128:$Rn), 15, 14)), - (TRN2vvv_16b VPR128:$Rn, VPR128:$Rm)>; - -class NI_Trn2_v8 - : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 - (Ty VPR:$Rm), - (Ty VPR:$Rn), 1, 0)), - (Ty VPR:$Rn), 3, 2)), - (Ty VPR:$Rn), 5, 4)), - (Ty VPR:$Rn), 7, 6)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Trn2_v8; -def : NI_Trn2_v8; - -class NI_Trn2_v4 - : Pat<(Ty (ei (Ty (ei - (Ty VPR:$Rm), - (Ty VPR:$Rn), 1, 0)), - (Ty VPR:$Rn), 3, 2)), - (INST VPR:$Rn, VPR:$Rm)>; - -def : NI_Trn2_v4; -def : NI_Trn2_v4; -def : NI_Trn2_v4; - -// End of implementation for instruction class (Perm) + [(set (Ty OpVPR:$Rd), + (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))], + NoItinerary>; + +multiclass NeonI_Perm_pat opcode, string asmop, + SDPatternOperator opnode> { + def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, + VPR64, "8b", opnode, v8i8>; + def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, + VPR128, "16b",opnode, v16i8>; + def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, + VPR64, "4h", opnode, v4i16>; + def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, + VPR128, "8h", opnode, v8i16>; + def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, + VPR64, "2s", opnode, v2i32>; + def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, + VPR128, "4s", opnode, v4i32>; + def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, + VPR128, "2d", opnode, v2i64>; +} + +defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>; +defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>; +defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>; +defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>; +defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>; +defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>; + +multiclass NeonI_Perm_float_pat { + def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))), + (!cast(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>; + + def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))), + (!cast(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>; + + def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))), + (!cast(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>; +} + +defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>; +defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>; +defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>; +defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>; +defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>; +defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>; // The followings are for instruction class (3V Diff) diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll index 4db4771cf1..4e1756e705 100644 --- a/test/CodeGen/AArch64/neon-perm.ll +++ b/test/CodeGen/AArch64/neon-perm.ll @@ -1674,3 +1674,17 @@ entry: %.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1 ret %struct.poly16x8x2_t %.fca.0.1.insert } + +define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) { +; CHECK: test_uzp: + + %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> + %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> + %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0 + %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1 + ret %struct.uint8x8x2_t %.fca.0.1.insert + +; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1] +; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +} -- cgit v1.2.3