From 709d59255a3100c7d440c93069efa1f726677a27 Mon Sep 17 00:00:00 2001 From: Bob Wilson Date: Wed, 25 Aug 2010 23:27:42 +0000 Subject: Start converting NEON load/stores to use pseudo instructions, beginning here with the VST4 instructions. Until after register allocation, we want to represent sets of adjacent registers by a single super-register. These VST4 pseudo instructions have a single QQ or QQQQ source register operand. They get expanded to the real VST4 instructions with 4 separate D register operands. Once this conversion is complete, we'll be able to remove the NEONPreAllocPass and avoid some fragile and hacky code elsewhere. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112108 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 102 ++++++++++++++++++++++++++++++-- lib/Target/ARM/ARMISelDAGToDAG.cpp | 44 ++++++++++---- lib/Target/ARM/ARMInstrFormats.td | 8 +++ lib/Target/ARM/ARMInstrNEON.td | 31 +++++++++- lib/Target/ARM/NEONPreAllocPass.cpp | 21 ------- 5 files changed, 165 insertions(+), 41 deletions(-) diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index e5f8a638eb..c71b093b0b 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -24,6 +24,13 @@ using namespace llvm; namespace { class ARMExpandPseudo : public MachineFunctionPass { + // Constants for register spacing in NEON load/store instructions. + enum NEONRegSpacing { + SingleSpc, + EvenDblSpc, + OddDblSpc + }; + public: static char ID; ARMExpandPseudo() : MachineFunctionPass(ID) {} @@ -41,6 +48,8 @@ namespace { void TransferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI); bool ExpandMBB(MachineBasicBlock &MBB); + void ExpandVST4(MachineBasicBlock::iterator &MBBI, unsigned Opc, + bool hasWriteBack, NEONRegSpacing RegSpc); }; char ARMExpandPseudo::ID = 0; } @@ -63,6 +72,61 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI, } } +/// ExpandVST4 - Translate VST4 pseudo instructions with QQ or QQQQ register +/// operands to real VST4 instructions with 4 D register operands. +void ARMExpandPseudo::ExpandVST4(MachineBasicBlock::iterator &MBBI, + unsigned Opc, bool hasWriteBack, + NEONRegSpacing RegSpc) { + MachineInstr &MI = *MBBI; + MachineBasicBlock &MBB = *MI.getParent(); + + MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)); + unsigned OpIdx = 0; + if (hasWriteBack) { + bool DstIsDead = MI.getOperand(OpIdx).isDead(); + unsigned DstReg = MI.getOperand(OpIdx++).getReg(); + MIB.addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead)); + } + // Copy the addrmode6 operands. + bool AddrIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(AddrIsKill)); + MIB.addImm(MI.getOperand(OpIdx++).getImm()); + if (hasWriteBack) { + // Copy the am6offset operand. + bool OffsetIsKill = MI.getOperand(OpIdx).isKill(); + MIB.addReg(MI.getOperand(OpIdx++).getReg(), getKillRegState(OffsetIsKill)); + } + + bool SrcIsKill = MI.getOperand(OpIdx).isKill(); + unsigned SrcReg = MI.getOperand(OpIdx).getReg(); + unsigned D0, D1, D2, D3; + if (RegSpc == SingleSpc) { + D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_1); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_2); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_3); + } else if (RegSpc == EvenDblSpc) { + D0 = TRI->getSubReg(SrcReg, ARM::dsub_0); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_2); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_4); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_6); + } else { + assert(RegSpc == OddDblSpc && "unknown register spacing for VST4"); + D0 = TRI->getSubReg(SrcReg, ARM::dsub_1); + D1 = TRI->getSubReg(SrcReg, ARM::dsub_3); + D2 = TRI->getSubReg(SrcReg, ARM::dsub_5); + D3 = TRI->getSubReg(SrcReg, ARM::dsub_7); + } + + MIB.addReg(D0, getKillRegState(SrcIsKill)) + .addReg(D1, getKillRegState(SrcIsKill)) + .addReg(D2, getKillRegState(SrcIsKill)) + .addReg(D3, getKillRegState(SrcIsKill)); + MIB = AddDefaultPred(MIB); + TransferImpOps(MI, MIB, MIB); + MI.eraseFromParent(); +} + bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; @@ -71,9 +135,13 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { MachineInstr &MI = *MBBI; MachineBasicBlock::iterator NMBBI = llvm::next(MBBI); + bool ModifiedOp = true; unsigned Opcode = MI.getOpcode(); switch (Opcode) { - default: break; + default: + ModifiedOp = false; + break; + case ARM::tLDRpci_pic: case ARM::t2LDRpci_pic: { unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic) @@ -92,7 +160,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { .addOperand(MI.getOperand(2)); TransferImpOps(MI, MIB1, MIB2); MI.eraseFromParent(); - Modified = true; break; } @@ -128,7 +195,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { HI16.addImm(Pred).addReg(PredReg); TransferImpOps(MI, LO16, HI16); MI.eraseFromParent(); - Modified = true; break; } @@ -155,9 +221,37 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { .addReg(OddSrc, getKillRegState(SrcIsKill))); TransferImpOps(MI, Even, Odd); MI.eraseFromParent(); - Modified = true; } + + case ARM::VST4d8Pseudo: + ExpandVST4(MBBI, ARM::VST4d8, false, SingleSpc); break; + case ARM::VST4d16Pseudo: + ExpandVST4(MBBI, ARM::VST4d16, false, SingleSpc); break; + case ARM::VST4d32Pseudo: + ExpandVST4(MBBI, ARM::VST4d32, false, SingleSpc); break; + case ARM::VST4d8Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4d8_UPD, true, SingleSpc); break; + case ARM::VST4d16Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4d16_UPD, true, SingleSpc); break; + case ARM::VST4d32Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4d32_UPD, true, SingleSpc); break; + case ARM::VST4q8Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc); break; + case ARM::VST4q16Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc); break; + case ARM::VST4q32Pseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc); break; + case ARM::VST4q8oddPseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q8_UPD, true, OddDblSpc); break; + case ARM::VST4q16oddPseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q16_UPD, true, OddDblSpc); break; + case ARM::VST4q32oddPseudo_UPD: + ExpandVST4(MBBI, ARM::VST4q32_UPD, true, OddDblSpc); break; + break; } + + if (ModifiedOp) + Modified = true; MBBI = NMBBI; } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 89c32eae65..86f64bc48a 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1260,6 +1260,11 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(MemAddr); Ops.push_back(Align); + // FIXME: This is a temporary flag to distinguish VSTs that have been + // converted to pseudo instructions. + bool usePseudoInstrs = (NumVecs == 4 && + VT.getSimpleVT().SimpleTy != MVT::v1i64); + if (is64BitVector) { if (NumVecs >= 2) { SDValue RegSeq; @@ -1278,6 +1283,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, : N->getOperand(3+3); RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0); } + if (usePseudoInstrs) + Ops.push_back(RegSeq); + else { // Now extract the D registers back out. Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT, @@ -1290,15 +1298,16 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, if (NumVecs > 3) Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT, RegSeq)); + } } else { - for (unsigned Vec = 0; Vec < NumVecs; ++Vec) - Ops.push_back(N->getOperand(Vec+3)); + Ops.push_back(N->getOperand(3)); } Ops.push_back(Pred); Ops.push_back(Reg0); // predicate register Ops.push_back(Chain); unsigned Opc = DOpcodes[OpcodeIndex]; - return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5); + return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), + usePseudoInstrs ? 6 : NumVecs+5); } EVT RegVT = GetNEONSubregVT(VT); @@ -1363,6 +1372,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, // Store the even D registers. assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering"); Ops.push_back(Reg0); // post-access address offset + if (usePseudoInstrs) + Ops.push_back(RegSeq); + else for (unsigned Vec = 0; Vec < NumVecs; ++Vec) Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl, RegVT, RegSeq)); @@ -1371,18 +1383,24 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs, Ops.push_back(Chain); unsigned Opc = QOpcodes0[OpcodeIndex]; SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); + MVT::Other, Ops.data(), + usePseudoInstrs ? 7 : NumVecs+6); Chain = SDValue(VStA, 1); // Store the odd D registers. Ops[0] = SDValue(VStA, 0); // MemAddr + if (usePseudoInstrs) + Ops[6] = Chain; + else { for (unsigned Vec = 0; Vec < NumVecs; ++Vec) Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl, RegVT, RegSeq); Ops[NumVecs+5] = Chain; + } Opc = QOpcodes1[OpcodeIndex]; SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(), - MVT::Other, Ops.data(), NumVecs+6); + MVT::Other, Ops.data(), + usePseudoInstrs ? 7 : NumVecs+6); Chain = SDValue(VStB, 1); ReplaceUses(SDValue(N, 0), Chain); return NULL; @@ -2312,14 +2330,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { } case Intrinsic::arm_neon_vst4: { - unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16, - ARM::VST4d32, ARM::VST1d64Q }; - unsigned QOpcodes0[] = { ARM::VST4q8_UPD, - ARM::VST4q16_UPD, - ARM::VST4q32_UPD }; - unsigned QOpcodes1[] = { ARM::VST4q8odd_UPD, - ARM::VST4q16odd_UPD, - ARM::VST4q32odd_UPD }; + unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo, + ARM::VST4d32Pseudo, ARM::VST1d64Q }; + unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD, + ARM::VST4q16Pseudo_UPD, + ARM::VST4q32Pseudo_UPD }; + unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD, + ARM::VST4q16oddPseudo_UPD, + ARM::VST4q32oddPseudo_UPD }; return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1); } diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 25c94331a4..12bbde4523 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -1534,6 +1534,14 @@ class NLdSt op21_20, bits<4> op11_8, bits<4> op7_4, let Inst{7-4} = op7_4; } +class PseudoNLdSt + : InstARM { + let OutOperandList = oops; + let InOperandList = !con(iops, (ins pred:$p)); + list Predicates = [HasNEON]; +} + class NDataI pattern> : NeonI; let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in { +// Classes for VST* pseudo-instructions with multi-register operands. +// These are expanded to real instructions after register allocation. +class VSTQQPseudo + : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">; +class VSTQQWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST, + "$addr.addr = $wb">; +class VSTQQQQWBPseudo + : PseudoNLdSt<(outs GPR:$wb), + (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST, + "$addr.addr = $wb">; + // VST1 : Vector Store (multiple single elements) class VST1D op7_4, string Dt> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST, @@ -664,6 +677,10 @@ def VST4d8 : VST4D<0b0000, 0b0000, "8">; def VST4d16 : VST4D<0b0000, 0b0100, "16">; def VST4d32 : VST4D<0b0000, 0b1000, "32">; +def VST4d8Pseudo : VSTQQPseudo; +def VST4d16Pseudo : VSTQQPseudo; +def VST4d32Pseudo : VSTQQPseudo; + // ...with address register writeback: class VST4DWB op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), @@ -676,6 +693,10 @@ def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">; def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">; def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">; +def VST4d8Pseudo_UPD : VSTQQWBPseudo; +def VST4d16Pseudo_UPD : VSTQQWBPseudo; +def VST4d32Pseudo_UPD : VSTQQWBPseudo; + // ...with double-spaced registers (non-updating versions for disassembly only): def VST4q8 : VST4D<0b0001, 0b0000, "8">; def VST4q16 : VST4D<0b0001, 0b0100, "16">; @@ -684,10 +705,14 @@ def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">; def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">; def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">; +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo; + // ...alternate versions to be allocated odd register numbers: -def VST4q8odd_UPD : VST4DWB<0b0001, 0b0000, "8">; -def VST4q16odd_UPD : VST4DWB<0b0001, 0b0100, "16">; -def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">; +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; // VST1LN : Vector Store (single element from one lane) // FIXME: Not yet implemented. diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp index 006a25fb15..0ba11c83ab 100644 --- a/lib/Target/ARM/NEONPreAllocPass.cpp +++ b/lib/Target/ARM/NEONPreAllocPass.cpp @@ -260,9 +260,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, Stride = 2; return true; - case ARM::VST4d8: - case ARM::VST4d16: - case ARM::VST4d32: case ARM::VST1d64Q: case ARM::VST4LNd8: case ARM::VST4LNd16: @@ -271,24 +268,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs, NumRegs = 4; return true; - case ARM::VST4q8_UPD: - case ARM::VST4q16_UPD: - case ARM::VST4q32_UPD: - FirstOpnd = 4; - NumRegs = 4; - Offset = 0; - Stride = 2; - return true; - - case ARM::VST4q8odd_UPD: - case ARM::VST4q16odd_UPD: - case ARM::VST4q32odd_UPD: - FirstOpnd = 4; - NumRegs = 4; - Offset = 1; - Stride = 2; - return true; - case ARM::VST4LNq16: case ARM::VST4LNq32: FirstOpnd = 2; -- cgit v1.2.3