From 1c8add99789a3066e94e97c56a6ce11a5c8e8740 Mon Sep 17 00:00:00 2001
From: Daniel Sanders
Date: Wed, 30 Apr 2014 12:09:32 +0000
Subject: [mips][msa] Fix vector insertions where the index is variable

Summary:
This isn't supported directly, so we rotate the vector by the desired number
of elements, insert into element zero, then rotate back.

The i64 case generates rather poor code on MIPS32. There is an obvious
optimisation to be made in future (do both insert.w's inside a shared
rotate/unrotate sequence) but for now it's sufficient to select valid code
instead of aborting.

Depends on D3536

Reviewers: matheusalmeida

Reviewed By: matheusalmeida

Differential Revision: http://reviews.llvm.org/D3537

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207640 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/Mips/MipsMSAInstrInfo.td             |  30 ++++++
 lib/Target/Mips/MipsSEISelLowering.cpp          | 137 ++++++++++++++++++++++++
 lib/Target/Mips/MipsSEISelLowering.h            |   5 +
 test/CodeGen/Mips/msa/basic_operations.ll       | 123 +++++++++++++++++++++
 test/CodeGen/Mips/msa/basic_operations_float.ll |  52 +++++++++
 5 files changed, 347 insertions(+)

diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index d30e5f0461..a35a16e1d3 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -1505,6 +1505,15 @@ class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
   bit usesCustomInserter = 1;
   string Constraints = "$wd = $wd_in";
 }
+
+class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                                  RegisterOperand ROWD, RegisterOperand ROFS> :
+  MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, GPR32Opnd:$n, ROFS:$fs),
+            [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                    GPR32Opnd:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
 
 class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                           RegisterOperand ROWD, RegisterOperand ROFS = ROWD> {
@@ -2300,11 +2309,25 @@ class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
 class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64,
                                            MSA128DOpnd, GPR64Opnd>;
 
+class INSERT_B_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd>;
+
 class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
                                                      MSA128WOpnd, FGR32Opnd>;
 class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
                                                      MSA128DOpnd, FGR64Opnd>;
 
+class INSERT_FW_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_VIDX_PSEUDO_DESC :
+  MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd>;
+
 class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8,
                                          MSA128BOpnd>;
 class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16,
@@ -3214,6 +3237,13 @@ let DecoderMethod = "DecodeINSVE_DF" in {
 def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
 def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
 
+def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC;
+def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC;
+def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC;
+def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC;
+def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC;
+def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC;
+
 def LD_B: LD_B_ENC, LD_B_DESC;
 def LD_H: LD_H_ENC, LD_H_DESC;
 def LD_W: LD_W_ENC, LD_W_DESC;
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 93a1a03a87..eb9a819aa3 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1054,6 +1054,18 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return emitINSERT_FW(MI, BB);
   case Mips::INSERT_FD_PSEUDO:
     return emitINSERT_FD(MI, BB);
+  case Mips::INSERT_B_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 1, false);
+  case Mips::INSERT_H_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 2, false);
+  case Mips::INSERT_W_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, false);
+  case Mips::INSERT_D_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, false);
+  case Mips::INSERT_FW_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, true);
+  case Mips::INSERT_FD_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, true);
   case Mips::FILL_FW_PSEUDO:
     return emitFILL_FW(MI, BB);
   case Mips::FILL_FD_PSEUDO:
@@ -2887,6 +2899,131 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
   return BB;
 }
 
+// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction.
+//
+// For integer:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs)
+// =>
+// (SLL $lanetmp1, $lane, <log2size>)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSERT_[BHWD], $wdtmp2, $wdtmp1, $rs, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2, $lanetmp2)
+//
+// For floating point:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $fs)
+// =>
+// (SUBREG_TO_REG $wt, $fs, <subreg>)
+// (SLL $lanetmp1, $lane, <log2size>)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSVE_[WD], $wdtmp2, 0, $wdtmp1, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2, $lanetmp2)
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned EltSizeInBytes,
+                                         bool IsFP) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned SrcVecReg = MI->getOperand(1).getReg();
+  unsigned LaneReg = MI->getOperand(2).getReg();
+  unsigned SrcValReg = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *VecRC = nullptr;
+  const TargetRegisterClass *GPRRC = isGP64bit() ? &Mips::GPR64RegClass
+                                                 : &Mips::GPR32RegClass;
+  unsigned EltLog2Size;
+  unsigned InsertOp = 0;
+  unsigned InsveOp = 0;
+  switch (EltSizeInBytes) {
+  default:
+    llvm_unreachable("Unexpected size");
+  case 1:
+    EltLog2Size = 0;
+    InsertOp = Mips::INSERT_B;
+    InsveOp = Mips::INSVE_B;
+    VecRC = &Mips::MSA128BRegClass;
+    break;
+  case 2:
+    EltLog2Size = 1;
+    InsertOp = Mips::INSERT_H;
+    InsveOp = Mips::INSVE_H;
+    VecRC = &Mips::MSA128HRegClass;
+    break;
+  case 4:
+    EltLog2Size = 2;
+    InsertOp = Mips::INSERT_W;
+    InsveOp = Mips::INSVE_W;
+    VecRC = &Mips::MSA128WRegClass;
+    break;
+  case 8:
+    EltLog2Size = 3;
+    InsertOp = Mips::INSERT_D;
+    InsveOp = Mips::INSVE_D;
+    VecRC = &Mips::MSA128DRegClass;
+    break;
+  }
+
+  if (IsFP) {
+    unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo);
+    SrcValReg = Wt;
+  }
+
+  // Convert the lane index into a byte index
+  if (EltSizeInBytes != 1) {
+    unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SLL), LaneTmp1)
+        .addReg(LaneReg)
+        .addImm(EltLog2Size);
+    LaneReg = LaneTmp1;
+  }
+
+  // Rotate bytes around so that the desired lane is element zero
+  unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
+      .addReg(SrcVecReg)
+      .addReg(SrcVecReg)
+      .addReg(LaneReg);
+
+  unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+  if (IsFP) {
+    // Use insve.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(0);
+  } else {
+    // Use insert.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addReg(SrcValReg)
+        .addImm(0);
+  }
+
+  // Rotate elements the rest of the way for a full rotation.
+  // sld.df interprets $rt modulo the number of columns so we only need to
+  // negate the lane index to do this.
+  unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUB), LaneTmp2)
+      .addReg(Mips::ZERO)
+      .addReg(LaneReg);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
+      .addReg(WdTmp2)
+      .addReg(WdTmp2)
+      .addReg(LaneTmp2);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
 // Emit the FILL_FW pseudo instruction.
 //
 // fill_fw_pseudo $wd, $fs
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index b88ddea89b..03a20ef674 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -98,6 +98,11 @@ namespace llvm {
     /// \brief Emit the INSERT_FD pseudo instruction
     MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
                                      MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+    MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned EltSizeInBytes,
+                                          bool IsFP) const;
     /// \brief Emit the FILL_FW pseudo instruction
     MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
                                    MachineBasicBlock *BB) const;
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index cfe43b94ff..dbdf42be49 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -673,6 +673,129 @@ define void @insert_v2i64(i64 %a) nounwind {
   ; MIPS32-AE: .size insert_v2i64
 }
 
+define void @insert_v16i8_vidx(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v16i8_vidx:
+
+  %1 = load <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: ld.b [[R1:\$w[0-9]+]],
+
+  %2 = load i32* @i32
+  ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  %a2 = trunc i32 %a to i8
+  %a3 = sext i8 %a2 to i32
+  %a4 = trunc i32 %a3 to i8
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %3 = insertelement <16 x i8> %1, i8 %a4, i32 %2
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[IDX]]]
+  ; MIPS32-AE-DAG: insert.b [[R1]][0], $4
+  ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[IDX]]
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <16 x i8> %3, <16 x i8>* @v16i8
+  ; MIPS32-AE-DAG: st.b [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v16i8_vidx
+}
+
+define void @insert_v8i16_vidx(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v8i16_vidx:
+
+  %1 = load <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: ld.h [[R1:\$w[0-9]+]],
+
+  %2 = load i32* @i32
+  ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  %a2 = trunc i32 %a to i16
+  %a3 = sext i16 %a2 to i32
+  %a4 = trunc i32 %a3 to i16
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %3 = insertelement <8 x i16> %1, i16 %a4, i32 %2
+  ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 1
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-AE-DAG: insert.h [[R1]][0], $4
+  ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <8 x i16> %3, <8 x i16>* @v8i16
+  ; MIPS32-AE-DAG: st.h [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v8i16_vidx
+}
+
+define void @insert_v4i32_vidx(i32 %a) nounwind {
+  ; MIPS32-AE: insert_v4i32_vidx:
+
+  %1 = load <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = load i32* @i32
+  ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %3 = insertelement <4 x i32> %1, i32 %a, i32 %2
+  ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-AE-DAG: insert.w [[R1]][0], $4
+  ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <4 x i32> %3, <4 x i32>* @v4i32
+  ; MIPS32-AE-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v4i32_vidx
+}
+
+define void @insert_v2i64_vidx(i64 %a) nounwind {
+  ; MIPS32-AE: insert_v2i64_vidx:
+
+  %1 = load <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: ld.w [[R1:\$w[0-9]+]],
+
+  %2 = load i32* @i32
+  ; MIPS32-AE-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-AE-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  ; MIPS32-AE-NOT: andi
+  ; MIPS32-AE-NOT: sra
+
+  %3 = insertelement <2 x i64> %1, i64 %a, i32 %2
+  ; TODO: This code could be a lot better but it works. The legalizer splits
+  ; 64-bit inserts into two 32-bit inserts because there is no i64 type on
+  ; MIPS32. The obvious optimisation is to perform both insert.w's at once while
+  ; the vector is rotated.
+  ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-AE-DAG: insert.w [[R1]][0], $4
+  ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+  ; MIPS32-AE-DAG: addiu [[IDX2:\$[0-9]+]], [[IDX]], 1
+  ; MIPS32-AE-DAG: sll [[BIDX:\$[0-9]+]], [[IDX2]], 2
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-AE-DAG: insert.w [[R1]][0], $5
+  ; MIPS32-AE-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-AE-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <2 x i64> %3, <2 x i64>* @v2i64
+  ; MIPS32-AE-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32-AE: .size insert_v2i64_vidx
+}
+
 define void @truncstore() nounwind {
   ; MIPS32-AE-LABEL: truncstore:
diff --git a/test/CodeGen/Mips/msa/basic_operations_float.ll b/test/CodeGen/Mips/msa/basic_operations_float.ll
index ceefa9114b..a0c9d29e23 100644
--- a/test/CodeGen/Mips/msa/basic_operations_float.ll
+++ b/test/CodeGen/Mips/msa/basic_operations_float.ll
@@ -275,3 +275,55 @@ define void @insert_v2f64(double %a) nounwind {
   ret void
   ; MIPS32: .size insert_v2f64
 }
+
+define void @insert_v4f32_vidx(float %a) nounwind {
+  ; MIPS32-LABEL: insert_v4f32_vidx:
+
+  %1 = load <4 x float>* @v4f32
+  ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v4f32)(
+  ; MIPS32-DAG: ld.w [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+  %2 = load i32* @i32
+  ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  %3 = insertelement <4 x float> %1, float %a, i32 %2
+  ; float argument passed in $f12
+  ; MIPS32-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 2
+  ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-DAG: insve.w [[R1]][0], $w12[0]
+  ; MIPS32-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <4 x float> %3, <4 x float>* @v4f32
+  ; MIPS32-DAG: st.w [[R1]]
+
+  ret void
+  ; MIPS32: .size insert_v4f32_vidx
+}
+
+define void @insert_v2f64_vidx(double %a) nounwind {
+  ; MIPS32-LABEL: insert_v2f64_vidx:
+
+  %1 = load <2 x double>* @v2f64
+  ; MIPS32-DAG: lw [[PTR_V:\$[0-9]+]], %got(v2f64)(
+  ; MIPS32-DAG: ld.d [[R1:\$w[0-9]+]], 0([[PTR_V]])
+
+  %2 = load i32* @i32
+  ; MIPS32-DAG: lw [[PTR_I:\$[0-9]+]], %got(i32)(
+  ; MIPS32-DAG: lw [[IDX:\$[0-9]+]], 0([[PTR_I]])
+
+  %3 = insertelement <2 x double> %1, double %a, i32 %2
+  ; double argument passed in $f12
+  ; MIPS32-DAG: sll [[BIDX:\$[0-9]+]], [[IDX]], 3
+  ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[BIDX]]]
+  ; MIPS32-DAG: insve.d [[R1]][0], $w12[0]
+  ; MIPS32-DAG: neg [[NIDX:\$[0-9]+]], [[BIDX]]
+  ; MIPS32-DAG: sld.b [[R1]], [[R1]]{{\[}}[[NIDX]]]
+
+  store <2 x double> %3, <2 x double>* @v2f64
+  ; MIPS32-DAG: st.d [[R1]]
+
+  ret void
+  ; MIPS32: .size insert_v2f64_vidx
+}
--
cgit v1.2.3
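
For readers who want to try the new lowering outside the test suite, here is a
minimal standalone example in the same style as the tests above. It is
illustrative only and not part of the patch: the global and function names are
made up, and the assembly in the comments is a sketch of the expected shape
rather than exact output. Run through llc with the MSA test flags (for example
-march=mips -mattr=+msa,+fp64), the variable-index insertelement below should
produce the sll/sld.b/insert.w/neg/sld.b sequence that INSERT_W_VIDX_PSEUDO
expands to.

@vec = global <4 x i32> zeroinitializer

define void @insert_w_variable(i32 %val, i32 %idx) nounwind {
  ; Load the vector, insert %val at the run-time index %idx, store it back.
  %v = load <4 x i32>* @vec
  %r = insertelement <4 x i32> %v, i32 %val, i32 %idx
  store <4 x i32> %r, <4 x i32>* @vec
  ; Expected code shape (sketch; actual register numbers will differ):
  ;   sll      $t, $idx, 2     scale the lane index to a byte offset
  ;   sld.b    $w0, $w0[$t]    rotate the target lane down to element 0
  ;   insert.w $w0[0], $val    insert the scalar into element 0
  ;   neg      $n, $t          negate the byte offset
  ;   sld.b    $w0, $w0[$n]    rotate back to the original element order
  ret void
}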