summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bob Wilson <bob.wilson@apple.com> 2010-08-25 23:27:42 +0000
committer Bob Wilson <bob.wilson@apple.com> 2010-08-25 23:27:42 +0000
commit 709d59255a3100c7d440c93069efa1f726677a27 (patch)
tree 9fe308a88c943b14b2d0a07e00f82ac3a3d730b0
parent 5b5f7260a0f0da9a2057245fd42a6b196ccec33d (diff)
download llvm-709d59255a3100c7d440c93069efa1f726677a27.tar.gz
llvm-709d59255a3100c7d440c93069efa1f726677a27.tar.bz2
llvm-709d59255a3100c7d440c93069efa1f726677a27.tar.xz
Start converting NEON load/stores to use pseudo instructions, beginning here
with the VST4 instructions.

Until after register allocation, we want to represent sets of adjacent
registers by a single super-register. These VST4 pseudo instructions have a
single QQ or QQQQ source register operand. They get expanded to the real VST4
instructions with 4 separate D register operands. Once this conversion is
complete, we'll be able to remove the NEONPreAllocPass and avoid some fragile
and hacky code elsewhere.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@112108 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- lib/Target/ARM/ARMExpandPseudoInsts.cpp 102
-rw-r--r-- lib/Target/ARM/ARMISelDAGToDAG.cpp 44
-rw-r--r-- lib/Target/ARM/ARMInstrFormats.td 8
-rw-r--r-- lib/Target/ARM/ARMInstrNEON.td 31
-rw-r--r-- lib/Target/ARM/NEONPreAllocPass.cpp 21
5 files changed, 165 insertions, 41 deletions
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index e5f8a638eb..c71b093b0b 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -24,6 +24,13 @@ using namespace llvm;
namespace {
class ARMExpandPseudo : public MachineFunctionPass {
+ // Constants for register spacing in NEON load/store instructions.
+ enum NEONRegSpacing {
+ SingleSpc, // ExpandVST4 picks consecutive D sub-registers: dsub_0, 1, 2, 3
+ EvenDblSpc, // ExpandVST4 picks the even D sub-registers: dsub_0, 2, 4, 6
+ OddDblSpc // ExpandVST4 picks the odd D sub-registers: dsub_1, 3, 5, 7
+ };
+
public:
static char ID;
ARMExpandPseudo() : MachineFunctionPass(ID) {}
@@ -41,6 +48,8 @@ namespace {
void TransferImpOps(MachineInstr &OldMI,
MachineInstrBuilder &UseMI, MachineInstrBuilder &DefMI);
bool ExpandMBB(MachineBasicBlock &MBB);
+ void ExpandVST4(MachineBasicBlock::iterator &MBBI, unsigned Opc,
+ bool hasWriteBack, NEONRegSpacing RegSpc);
};
char ARMExpandPseudo::ID = 0;
}
@@ -63,6 +72,61 @@ void ARMExpandPseudo::TransferImpOps(MachineInstr &OldMI,
}
}
+/// ExpandVST4 - Replace a VST4 pseudo instruction, whose source is a single
+/// QQ or QQQQ super-register, with the real VST4 instruction \p Opc that names
+/// four individual D registers.  \p hasWriteBack says whether the pseudo
+/// carries a leading writeback def and an am6offset operand; \p RegSpc selects
+/// which D sub-registers of the source are stored.  The new instruction is
+/// built in front of \p MBBI and the pseudo is then erased.
+void ARMExpandPseudo::ExpandVST4(MachineBasicBlock::iterator &MBBI,
+                                 unsigned Opc, bool hasWriteBack,
+                                 NEONRegSpacing RegSpc) {
+  MachineInstr &Pseudo = *MBBI;
+  MachineBasicBlock &Block = *Pseudo.getParent();
+  MachineInstrBuilder NewMI =
+    BuildMI(Block, MBBI, Pseudo.getDebugLoc(), TII->get(Opc));
+
+  // The pseudo's operands are consumed strictly left to right.
+  unsigned NextOp = 0;
+
+  // Writeback forms begin with the register def that receives the address.
+  if (hasWriteBack) {
+    const MachineOperand &WB = Pseudo.getOperand(NextOp++);
+    NewMI.addReg(WB.getReg(),
+                 getDefRegState(true) | getDeadRegState(WB.isDead()));
+  }
+
+  // addrmode6: a register followed by an immediate.
+  const MachineOperand &AddrReg = Pseudo.getOperand(NextOp++);
+  NewMI.addReg(AddrReg.getReg(), getKillRegState(AddrReg.isKill()));
+  const MachineOperand &AddrImm = Pseudo.getOperand(NextOp++);
+  NewMI.addImm(AddrImm.getImm());
+
+  // am6offset: only present on the writeback forms.
+  if (hasWriteBack) {
+    const MachineOperand &Offset = Pseudo.getOperand(NextOp++);
+    NewMI.addReg(Offset.getReg(), getKillRegState(Offset.isKill()));
+  }
+
+  // What remains is the super-register source; its kill flag is propagated
+  // to each of the D registers extracted from it.
+  const MachineOperand &Src = Pseudo.getOperand(NextOp);
+  const unsigned KillFlag = getKillRegState(Src.isKill());
+  const unsigned SuperReg = Src.getReg();
+
+  unsigned D0, D1, D2, D3;
+  switch (RegSpc) {
+  case SingleSpc:
+    D0 = TRI->getSubReg(SuperReg, ARM::dsub_0);
+    D1 = TRI->getSubReg(SuperReg, ARM::dsub_1);
+    D2 = TRI->getSubReg(SuperReg, ARM::dsub_2);
+    D3 = TRI->getSubReg(SuperReg, ARM::dsub_3);
+    break;
+  case EvenDblSpc:
+    D0 = TRI->getSubReg(SuperReg, ARM::dsub_0);
+    D1 = TRI->getSubReg(SuperReg, ARM::dsub_2);
+    D2 = TRI->getSubReg(SuperReg, ARM::dsub_4);
+    D3 = TRI->getSubReg(SuperReg, ARM::dsub_6);
+    break;
+  default:
+    assert(RegSpc == OddDblSpc && "unknown register spacing for VST4");
+    D0 = TRI->getSubReg(SuperReg, ARM::dsub_1);
+    D1 = TRI->getSubReg(SuperReg, ARM::dsub_3);
+    D2 = TRI->getSubReg(SuperReg, ARM::dsub_5);
+    D3 = TRI->getSubReg(SuperReg, ARM::dsub_7);
+    break;
+  }
+
+  NewMI.addReg(D0, KillFlag);
+  NewMI.addReg(D1, KillFlag);
+  NewMI.addReg(D2, KillFlag);
+  NewMI.addReg(D3, KillFlag);
+
+  // Finish with the default predicate operands, move over any implicit
+  // operands of the pseudo, and drop the pseudo itself.
+  NewMI = AddDefaultPred(NewMI);
+  TransferImpOps(Pseudo, NewMI, NewMI);
+  Pseudo.eraseFromParent();
+}
+
bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
bool Modified = false;
@@ -71,9 +135,13 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
MachineInstr &MI = *MBBI;
MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+ bool ModifiedOp = true;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
- default: break;
+ default:
+ ModifiedOp = false;
+ break;
+
case ARM::tLDRpci_pic:
case ARM::t2LDRpci_pic: {
unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
@@ -92,7 +160,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
.addOperand(MI.getOperand(2));
TransferImpOps(MI, MIB1, MIB2);
MI.eraseFromParent();
- Modified = true;
break;
}
@@ -128,7 +195,6 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
HI16.addImm(Pred).addReg(PredReg);
TransferImpOps(MI, LO16, HI16);
MI.eraseFromParent();
- Modified = true;
break;
}
@@ -155,9 +221,37 @@ bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
.addReg(OddSrc, getKillRegState(SrcIsKill)));
TransferImpOps(MI, Even, Odd);
MI.eraseFromParent();
- Modified = true;
+ // MI has just been erased: leave the switch here instead of falling
+ // through into the VST4 pseudo cases, which would dereference MBBI.
+ break;
}
+
+ // VST4 pseudos with a QQ source: four consecutive D registers.
+ case ARM::VST4d8Pseudo:
+ ExpandVST4(MBBI, ARM::VST4d8, false, SingleSpc); break;
+ case ARM::VST4d16Pseudo:
+ ExpandVST4(MBBI, ARM::VST4d16, false, SingleSpc); break;
+ case ARM::VST4d32Pseudo:
+ ExpandVST4(MBBI, ARM::VST4d32, false, SingleSpc); break;
+ case ARM::VST4d8Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4d8_UPD, true, SingleSpc); break;
+ case ARM::VST4d16Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4d16_UPD, true, SingleSpc); break;
+ case ARM::VST4d32Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4d32_UPD, true, SingleSpc); break;
+ // VST4 pseudos with a QQQQ source: the even D registers...
+ case ARM::VST4q8Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q8_UPD, true, EvenDblSpc); break;
+ case ARM::VST4q16Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q16_UPD, true, EvenDblSpc); break;
+ case ARM::VST4q32Pseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q32_UPD, true, EvenDblSpc); break;
+ // ...and the odd D registers, using the same real opcodes.
+ case ARM::VST4q8oddPseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q8_UPD, true, OddDblSpc); break;
+ case ARM::VST4q16oddPseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q16_UPD, true, OddDblSpc); break;
+ case ARM::VST4q32oddPseudo_UPD:
+ ExpandVST4(MBBI, ARM::VST4q32_UPD, true, OddDblSpc); break;
}
+
+ if (ModifiedOp)
+ Modified = true;
MBBI = NMBBI;
}
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 89c32eae65..86f64bc48a 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1260,6 +1260,11 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Ops.push_back(MemAddr);
Ops.push_back(Align);
+ // FIXME: This is a temporary flag to distinguish VSTs that have been
+ // converted to pseudo instructions.
+ bool usePseudoInstrs = (NumVecs == 4 &&
+ VT.getSimpleVT().SimpleTy != MVT::v1i64);
+
if (is64BitVector) {
if (NumVecs >= 2) {
SDValue RegSeq;
@@ -1278,6 +1283,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
: N->getOperand(3+3);
RegSeq = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
}
+ if (usePseudoInstrs)
+ Ops.push_back(RegSeq);
+ else {
// Now extract the D registers back out.
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0, dl, VT,
@@ -1290,15 +1298,16 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
if (NumVecs > 3)
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_3, dl, VT,
RegSeq));
+ }
} else {
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
- Ops.push_back(N->getOperand(Vec+3));
+ Ops.push_back(N->getOperand(3));
}
Ops.push_back(Pred);
Ops.push_back(Reg0); // predicate register
Ops.push_back(Chain);
unsigned Opc = DOpcodes[OpcodeIndex];
- return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+5);
+ return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(),
+ usePseudoInstrs ? 6 : NumVecs+5);
}
EVT RegVT = GetNEONSubregVT(VT);
@@ -1363,6 +1372,9 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
// Store the even D registers.
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
Ops.push_back(Reg0); // post-access address offset
+ if (usePseudoInstrs)
+ Ops.push_back(RegSeq);
+ else
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec*2, dl,
RegVT, RegSeq));
@@ -1371,18 +1383,24 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Ops.push_back(Chain);
unsigned Opc = QOpcodes0[OpcodeIndex];
SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
- MVT::Other, Ops.data(), NumVecs+6);
+ MVT::Other, Ops.data(),
+ usePseudoInstrs ? 7 : NumVecs+6);
Chain = SDValue(VStA, 1);
// Store the odd D registers.
Ops[0] = SDValue(VStA, 0); // MemAddr
+ if (usePseudoInstrs)
+ Ops[6] = Chain;
+ else {
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
Ops[Vec+3] = CurDAG->getTargetExtractSubreg(ARM::dsub_1+Vec*2, dl,
RegVT, RegSeq);
Ops[NumVecs+5] = Chain;
+ }
Opc = QOpcodes1[OpcodeIndex];
SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
- MVT::Other, Ops.data(), NumVecs+6);
+ MVT::Other, Ops.data(),
+ usePseudoInstrs ? 7 : NumVecs+6);
Chain = SDValue(VStB, 1);
ReplaceUses(SDValue(N, 0), Chain);
return NULL;
@@ -2312,14 +2330,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
}
case Intrinsic::arm_neon_vst4: {
- unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16,
- ARM::VST4d32, ARM::VST1d64Q };
- unsigned QOpcodes0[] = { ARM::VST4q8_UPD,
- ARM::VST4q16_UPD,
- ARM::VST4q32_UPD };
- unsigned QOpcodes1[] = { ARM::VST4q8odd_UPD,
- ARM::VST4q16odd_UPD,
- ARM::VST4q32odd_UPD };
+ unsigned DOpcodes[] = { ARM::VST4d8Pseudo, ARM::VST4d16Pseudo,
+ ARM::VST4d32Pseudo, ARM::VST1d64Q };
+ unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD };
return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 25c94331a4..12bbde4523 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1534,6 +1534,14 @@ class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
let Inst{7-4} = op7_4;
}
+// Base class for the NEON pseudo-instructions that ARMInstrNEON.td derives
+// its VST pseudos from.  It instantiates InstARM with AddrMode6 and the Pseudo
+// format, forwards the constraint string and itinerary, and requires HasNEON.
+class PseudoNLdSt<dag oops, dag iops, InstrItinClass itin, string cstr>
+ : InstARM<AddrMode6, Size4Bytes, IndexModeNone, Pseudo, NeonDomain, cstr,
+ itin> {
+ let OutOperandList = oops;
+ let InOperandList = !con(iops, (ins pred:$p)); // inputs plus a trailing predicate operand
+ list<Predicate> Predicates = [HasNEON];
+}
+
class NDataI<dag oops, dag iops, Format f, InstrItinClass itin,
string opc, string dt, string asm, string cstr, list<dag> pattern>
: NeonI<oops, iops, AddrModeNone, IndexModeNone, f, itin, opc, dt, asm, cstr,
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 9c01fcd184..4c14b23314 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -486,6 +486,19 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32">;
let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
+// Classes for VST* pseudo-instructions with multi-register operands.
+// These are expanded to real instructions after register allocation.
+// VSTQQPseudo: no outputs; inputs are an addrmode6 address and a QQPR source.
+class VSTQQPseudo
+ : PseudoNLdSt<(outs), (ins addrmode6:$addr, QQPR:$src), IIC_VST, "">;
+// VSTQQWBPseudo: as above plus an am6offset input and a GPR $wb output; the
+// string "$addr.addr = $wb" is passed as the constraint argument.
+class VSTQQWBPseudo
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQPR:$src), IIC_VST,
+ "$addr.addr = $wb">;
+// VSTQQQQWBPseudo: same operands as VSTQQWBPseudo but with a QQQQPR source.
+class VSTQQQQWBPseudo
+ : PseudoNLdSt<(outs GPR:$wb),
+ (ins addrmode6:$addr, am6offset:$offset, QQQQPR:$src), IIC_VST,
+ "$addr.addr = $wb">;
+
// VST1 : Vector Store (multiple single elements)
class VST1D<bits<4> op7_4, string Dt>
: NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
@@ -664,6 +677,10 @@ def VST4d8 : VST4D<0b0000, 0b0000, "8">;
def VST4d16 : VST4D<0b0000, 0b0100, "16">;
def VST4d32 : VST4D<0b0000, 0b1000, "32">;
+def VST4d8Pseudo : VSTQQPseudo;
+def VST4d16Pseudo : VSTQQPseudo;
+def VST4d32Pseudo : VSTQQPseudo;
+
// ...with address register writeback:
class VST4DWB<bits<4> op11_8, bits<4> op7_4, string Dt>
: NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
@@ -676,6 +693,10 @@ def VST4d8_UPD : VST4DWB<0b0000, 0b0000, "8">;
def VST4d16_UPD : VST4DWB<0b0000, 0b0100, "16">;
def VST4d32_UPD : VST4DWB<0b0000, 0b1000, "32">;
+def VST4d8Pseudo_UPD : VSTQQWBPseudo;
+def VST4d16Pseudo_UPD : VSTQQWBPseudo;
+def VST4d32Pseudo_UPD : VSTQQWBPseudo;
+
// ...with double-spaced registers (non-updating versions for disassembly only):
def VST4q8 : VST4D<0b0001, 0b0000, "8">;
def VST4q16 : VST4D<0b0001, 0b0100, "16">;
@@ -684,10 +705,14 @@ def VST4q8_UPD : VST4DWB<0b0001, 0b0000, "8">;
def VST4q16_UPD : VST4DWB<0b0001, 0b0100, "16">;
def VST4q32_UPD : VST4DWB<0b0001, 0b1000, "32">;
+def VST4q8Pseudo_UPD : VSTQQQQWBPseudo;
+def VST4q16Pseudo_UPD : VSTQQQQWBPseudo;
+def VST4q32Pseudo_UPD : VSTQQQQWBPseudo;
+
// ...alternate versions to be allocated odd register numbers:
-def VST4q8odd_UPD : VST4DWB<0b0001, 0b0000, "8">;
-def VST4q16odd_UPD : VST4DWB<0b0001, 0b0100, "16">;
-def VST4q32odd_UPD : VST4DWB<0b0001, 0b1000, "32">;
+def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo;
+def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo;
+def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo;
// VST1LN : Vector Store (single element from one lane)
// FIXME: Not yet implemented.
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
index 006a25fb15..0ba11c83ab 100644
--- a/lib/Target/ARM/NEONPreAllocPass.cpp
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -260,9 +260,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
Stride = 2;
return true;
- case ARM::VST4d8:
- case ARM::VST4d16:
- case ARM::VST4d32:
case ARM::VST1d64Q:
case ARM::VST4LNd8:
case ARM::VST4LNd16:
@@ -271,24 +268,6 @@ static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
NumRegs = 4;
return true;
- case ARM::VST4q8_UPD:
- case ARM::VST4q16_UPD:
- case ARM::VST4q32_UPD:
- FirstOpnd = 4;
- NumRegs = 4;
- Offset = 0;
- Stride = 2;
- return true;
-
- case ARM::VST4q8odd_UPD:
- case ARM::VST4q16odd_UPD:
- case ARM::VST4q32odd_UPD:
- FirstOpnd = 4;
- NumRegs = 4;
- Offset = 1;
- Stride = 2;
- return true;
-
case ARM::VST4LNq16:
case ARM::VST4LNq32:
FirstOpnd = 2;