summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Bob Wilson <bob.wilson@apple.com> 2011-02-07 17:43:21 +0000
committer Bob Wilson <bob.wilson@apple.com> 2011-02-07 17:43:21 +0000
commit 1c3ef90cab9a563427bdd3c2fcd875c717750562 (patch)
tree 96e3030f27ad79d3147140f5ae0ecabf8f6b11c0
parent 7de6814405ab02591235f0826b8e6d98fd76c8ba (diff)
download llvm-1c3ef90cab9a563427bdd3c2fcd875c717750562.tar.gz
llvm-1c3ef90cab9a563427bdd3c2fcd875c717750562.tar.bz2
llvm-1c3ef90cab9a563427bdd3c2fcd875c717750562.tar.xz
Add codegen support for using post-increment NEON load/store instructions.

The vld1-lane, vld1-dup and vst1-lane instructions do not yet support using
post-increment versions, but all the rest of the NEON load/store instructions
should be handled now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@125014 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- lib/Target/ARM/ARMISelDAGToDAG.cpp 492
-rw-r--r-- lib/Target/ARM/ARMISelLowering.cpp 176
-rw-r--r-- lib/Target/ARM/ARMISelLowering.h 23
-rw-r--r-- test/CodeGen/ARM/vld1.ll 35
-rw-r--r-- test/CodeGen/ARM/vld2.ll 29
-rw-r--r-- test/CodeGen/ARM/vld3.ll 31
-rw-r--r-- test/CodeGen/ARM/vld4.ll 30
-rw-r--r-- test/CodeGen/ARM/vlddup.ll 61
-rw-r--r-- test/CodeGen/ARM/vldlane.ll 53
-rw-r--r-- test/CodeGen/ARM/vst1.ll 26
-rw-r--r-- test/CodeGen/ARM/vst2.ll 25
-rw-r--r-- test/CodeGen/ARM/vst3.ll 27
-rw-r--r-- test/CodeGen/ARM/vst4.ll 26
-rw-r--r-- test/CodeGen/ARM/vstlane.ll 38
14 files changed, 927 insertions, 145 deletions
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index fbdc2fbc15..5dd84341f8 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -196,26 +196,30 @@ private:
/// 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// loads of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+ SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
+ unsigned *DOpcodes,
unsigned *QOpcodes0, unsigned *QOpcodes1);
/// SelectVST - Select NEON store intrinsics. NumVecs should
/// be 1, 2, 3 or 4. The opcode arrays specify the instructions used for
/// stores of D registers and even subregs and odd subregs of Q registers.
/// For NumVecs <= 2, QOpcodes1 is not used.
- SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+ SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
+ unsigned *DOpcodes,
unsigned *QOpcodes0, unsigned *QOpcodes1);
/// SelectVLDSTLane - Select NEON load/store lane intrinsics. NumVecs should
/// be 2, 3 or 4. The opcode arrays specify the instructions used for
/// load/store of D registers and Q registers.
- SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad,
+ bool isUpdating, unsigned NumVecs,
unsigned *DOpcodes, unsigned *QOpcodes);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 2, 3 or 4. The opcode array specifies the instructions used
/// for loading D registers. (Q registers are not supported.)
- SDNode *SelectVLDDup(SDNode *N, unsigned NumVecs, unsigned *Opcodes);
+ SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ unsigned *Opcodes);
/// SelectVTBL - Select NEON VTBL and VTBX intrinsics. NumVecs should be 2,
/// 3 or 4. These are custom-selected so that a REG_SEQUENCE can be
@@ -1439,14 +1443,15 @@ SDValue ARMDAGToDAGISel::GetVLDSTAlign(SDValue Align, unsigned NumVecs,
return CurDAG->getTargetConstant(Alignment, MVT::i32);
}
-SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
+SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
unsigned *DOpcodes, unsigned *QOpcodes0,
unsigned *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
SDValue MemAddr, Align;
- if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return NULL;
SDValue Chain = N->getOperand(0);
@@ -1482,46 +1487,39 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
ResTyElts *= 2;
ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
}
+ std::vector<EVT> ResTys;
+ ResTys.push_back(ResTy);
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
SDValue Pred = getAL(CurDAG);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SDValue SuperReg;
- if (is64BitVector) {
- const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
- SDNode *VLd = CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl,
- ResTy, MVT::Other, Ops, 5);
- if (NumVecs == 1)
- return VLd;
-
- SuperReg = SDValue(VLd, 0);
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
- SDValue D = CurDAG->getTargetExtractSubreg(ARM::dsub_0+Vec,
- dl, VT, SuperReg);
- ReplaceUses(SDValue(N, Vec), D);
- }
- ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
- return NULL;
- }
-
- if (NumVecs <= 2) {
- // Quad registers are directly supported for VLD1 and VLD2,
- // loading pairs of D regs.
- const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
- SDNode *VLd = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
- ResTy, MVT::Other, Ops, 5);
- if (NumVecs == 1)
- return VLd;
+ SDNode *VLd;
+ SmallVector<SDValue, 7> Ops;
- SuperReg = SDValue(VLd, 0);
- Chain = SDValue(VLd, 1);
+ // Double registers and VLD1/VLD2 quad registers are directly supported.
+ if (is64BitVector || NumVecs <= 2) {
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex]);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
} else {
// Otherwise, quad registers are loaded with two separate instructions,
// where one loads the even registers and the other loads the odd registers.
EVT AddrTy = MemAddr.getValueType();
- // Load the even subregs.
+ // Load the even subregs. This is always an updating load, so that it
+ // provides the address to the second load for the odd subregs.
SDValue ImplDef =
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, ResTy), 0);
const SDValue OpsA[] = { MemAddr, Align, Reg0, ImplDef, Pred, Reg0, Chain };
@@ -1530,37 +1528,54 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
Chain = SDValue(VLdA, 2);
// Load the odd subregs.
- const SDValue OpsB[] = { SDValue(VLdA, 1), Align, SDValue(VLdA, 0),
- Pred, Reg0, Chain };
- SDNode *VLdB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl,
- ResTy, MVT::Other, OpsB, 6);
- SuperReg = SDValue(VLdB, 0);
- Chain = SDValue(VLdB, 1);
- }
-
- // Extract out the Q registers.
- assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
- SDValue Q = CurDAG->getTargetExtractSubreg(ARM::qsub_0+Vec,
- dl, VT, SuperReg);
- ReplaceUses(SDValue(N, Vec), Q);
- }
- ReplaceUses(SDValue(N, NumVecs), Chain);
+ Ops.push_back(SDValue(VLdA, 1));
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ assert(isa<ConstantSDNode>(Inc.getNode()) &&
+ "only constant post-increment update allowed for VLD3/4");
+ (void)Inc;
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(SDValue(VLdA, 0));
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ VLd = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+ Ops.data(), Ops.size());
+ }
+
+ if (NumVecs == 1)
+ return VLd;
+
+ // Extract out the subregisters.
+ SDValue SuperReg = SDValue(VLd, 0);
+ assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+ ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ unsigned Sub0 = (is64BitVector ? ARM::dsub_0 : ARM::qsub_0);
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
return NULL;
}
-SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
+SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
unsigned *DOpcodes, unsigned *QOpcodes0,
unsigned *QOpcodes1) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
SDValue MemAddr, Align;
- if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return NULL;
SDValue Chain = N->getOperand(0);
- EVT VT = N->getOperand(3).getValueType();
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
bool is64BitVector = VT.is64BitVector();
Align = GetVLDSTAlign(Align, NumVecs, is64BitVector);
@@ -1583,64 +1598,71 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
break;
}
+ std::vector<EVT> ResTys;
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+
SDValue Pred = getAL(CurDAG);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SmallVector<SDValue, 7> Ops;
- if (is64BitVector) {
+ // Double registers and VST1/VST2 quad registers are directly supported.
+ if (is64BitVector || NumVecs <= 2) {
SDValue SrcReg;
if (NumVecs == 1) {
- SrcReg = N->getOperand(3);
- } else {
- SDValue V0 = N->getOperand(0+3);
- SDValue V1 = N->getOperand(1+3);
-
+ SrcReg = N->getOperand(Vec0Idx);
+ } else if (is64BitVector) {
// Form a REG_SEQUENCE to force register allocation.
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
if (NumVecs == 2)
SrcReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
else {
- SDValue V2 = N->getOperand(2+3);
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
// If it's a vst3, form a quad D-register and leave the last part as
// an undef.
SDValue V3 = (NumVecs == 3)
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
- : N->getOperand(3+3);
+ : N->getOperand(Vec0Idx + 3);
SrcReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
}
- }
- const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain };
- return CurDAG->getMachineNode(DOpcodes[OpcodeIndex], dl,
- MVT::Other, Ops, 6);
- }
-
- if (NumVecs <= 2) {
- // Quad registers are directly supported for VST1 and VST2.
- SDValue SrcReg;
- if (NumVecs == 1) {
- SrcReg = N->getOperand(3);
} else {
// Form a QQ register.
- SDValue Q0 = N->getOperand(3);
- SDValue Q1 = N->getOperand(4);
+ SDValue Q0 = N->getOperand(Vec0Idx);
+ SDValue Q1 = N->getOperand(Vec0Idx + 1);
SrcReg = SDValue(PairQRegs(MVT::v4i64, Q0, Q1), 0);
}
- const SDValue Ops[] = { MemAddr, Align, SrcReg, Pred, Reg0, Chain };
- return CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
- MVT::Other, Ops, 6);
+
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes0[OpcodeIndex]);
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ }
+ Ops.push_back(SrcReg);
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ return CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
}
// Otherwise, quad registers are stored with two separate instructions,
// where one stores the even registers and the other stores the odd registers.
// Form the QQQQ REG_SEQUENCE.
- SDValue V0 = N->getOperand(0+3);
- SDValue V1 = N->getOperand(1+3);
- SDValue V2 = N->getOperand(2+3);
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
SDValue V3 = (NumVecs == 3)
? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
- : N->getOperand(3+3);
+ : N->getOperand(Vec0Idx + 3);
SDValue RegSeq = SDValue(QuadQRegs(MVT::v8i64, V0, V1, V2, V3), 0);
- // Store the even D registers.
+ // Store the even D registers. This is always an updating store, so that it
+ // provides the address to the second store for the odd subregs.
const SDValue OpsA[] = { MemAddr, Align, Reg0, RegSeq, Pred, Reg0, Chain };
SDNode *VStA = CurDAG->getMachineNode(QOpcodes0[OpcodeIndex], dl,
MemAddr.getValueType(),
@@ -1648,28 +1670,40 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Chain = SDValue(VStA, 1);
// Store the odd D registers.
- const SDValue OpsB[] = { SDValue(VStA, 0), Align, RegSeq, Pred, Reg0, Chain };
- SDNode *VStB = CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl,
- MVT::Other, OpsB, 6);
- Chain = SDValue(VStB, 0);
- ReplaceUses(SDValue(N, 0), Chain);
- return NULL;
+ Ops.push_back(SDValue(VStA, 0));
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ assert(isa<ConstantSDNode>(Inc.getNode()) &&
+ "only constant post-increment update allowed for VST3/4");
+ (void)Inc;
+ Ops.push_back(Reg0);
+ }
+ Ops.push_back(RegSeq);
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
+ return CurDAG->getMachineNode(QOpcodes1[OpcodeIndex], dl, ResTys,
+ Ops.data(), Ops.size());
}
SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
- unsigned NumVecs, unsigned *DOpcodes,
+ bool isUpdating, unsigned NumVecs,
+ unsigned *DOpcodes,
unsigned *QOpcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
SDValue MemAddr, Align;
- if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, Align))
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+ if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
return NULL;
SDValue Chain = N->getOperand(0);
unsigned Lane =
- cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
- EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
+ cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
bool is64BitVector = VT.is64BitVector();
unsigned Alignment = 0;
@@ -1701,29 +1735,42 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
case MVT::v4i32: OpcodeIndex = 1; break;
}
+ std::vector<EVT> ResTys;
+ if (IsLoad) {
+ unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
+ if (!is64BitVector)
+ ResTyElts *= 2;
+ ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(),
+ MVT::i64, ResTyElts));
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+
SDValue Pred = getAL(CurDAG);
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
- SmallVector<SDValue, 7> Ops;
+ SmallVector<SDValue, 8> Ops;
Ops.push_back(MemAddr);
Ops.push_back(Align);
-
- unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
- QOpcodes[OpcodeIndex]);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ }
SDValue SuperReg;
- SDValue V0 = N->getOperand(0+3);
- SDValue V1 = N->getOperand(1+3);
+ SDValue V0 = N->getOperand(Vec0Idx + 0);
+ SDValue V1 = N->getOperand(Vec0Idx + 1);
if (NumVecs == 2) {
if (is64BitVector)
SuperReg = SDValue(PairDRegs(MVT::v2i64, V0, V1), 0);
else
SuperReg = SDValue(PairQRegs(MVT::v4i64, V0, V1), 0);
} else {
- SDValue V2 = N->getOperand(2+3);
+ SDValue V2 = N->getOperand(Vec0Idx + 2);
SDValue V3 = (NumVecs == 3)
- ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,dl,VT), 0)
- : N->getOperand(3+3);
+ ? SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, VT), 0)
+ : N->getOperand(Vec0Idx + 3);
if (is64BitVector)
SuperReg = SDValue(QuadDRegs(MVT::v4i64, V0, V1, V2, V3), 0);
else
@@ -1735,33 +1782,29 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
Ops.push_back(Reg0);
Ops.push_back(Chain);
+ unsigned Opc = (is64BitVector ? DOpcodes[OpcodeIndex] :
+ QOpcodes[OpcodeIndex]);
+ SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys,
+ Ops.data(), Ops.size());
if (!IsLoad)
- return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 7);
-
- EVT ResTy;
- unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
- if (!is64BitVector)
- ResTyElts *= 2;
- ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
-
- SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other,
- Ops.data(), 7);
- SuperReg = SDValue(VLdLn, 0);
- Chain = SDValue(VLdLn, 1);
+ return VLdLn;
// Extract the subregisters.
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- assert(ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
- unsigned SubIdx = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
+ SuperReg = SDValue(VLdLn, 0);
+ assert(ARM::dsub_7 == ARM::dsub_0+7 &&
+ ARM::qsub_3 == ARM::qsub_0+3 && "Unexpected subreg numbering");
+ unsigned Sub0 = is64BitVector ? ARM::dsub_0 : ARM::qsub_0;
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
- CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
- ReplaceUses(SDValue(N, NumVecs), Chain);
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
return NULL;
}
-SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
- unsigned *Opcodes) {
+SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+ unsigned NumVecs, unsigned *Opcodes) {
assert(NumVecs >=2 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
DebugLoc dl = N->getDebugLoc();
@@ -1800,13 +1843,26 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
SDValue SuperReg;
unsigned Opc = Opcodes[OpcodeIndex];
- const SDValue Ops[] = { MemAddr, Align, Pred, Reg0, Chain };
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(MemAddr);
+ Ops.push_back(Align);
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(2);
+ Ops.push_back(isa<ConstantSDNode>(Inc.getNode()) ? Reg0 : Inc);
+ }
+ Ops.push_back(Pred);
+ Ops.push_back(Reg0);
+ Ops.push_back(Chain);
unsigned ResTyElts = (NumVecs == 3) ? 4 : NumVecs;
- EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts);
- SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTy, MVT::Other, Ops, 5);
+ std::vector<EVT> ResTys;
+ ResTys.push_back(EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, ResTyElts));
+ if (isUpdating)
+ ResTys.push_back(MVT::i32);
+ ResTys.push_back(MVT::Other);
+ SDNode *VLdDup =
+ CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), Ops.size());
SuperReg = SDValue(VLdDup, 0);
- Chain = SDValue(VLdDup, 1);
// Extract the subregisters.
assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
@@ -1814,7 +1870,9 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, unsigned NumVecs,
for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
ReplaceUses(SDValue(N, Vec),
CurDAG->getTargetExtractSubreg(SubIdx+Vec, dl, VT, SuperReg));
- ReplaceUses(SDValue(N, NumVecs), Chain);
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
return NULL;
}
@@ -2470,19 +2528,165 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
case ARMISD::VLD2DUP: {
unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo, ARM::VLD2DUPd16Pseudo,
ARM::VLD2DUPd32Pseudo };
- return SelectVLDDup(N, 2, Opcodes);
+ return SelectVLDDup(N, false, 2, Opcodes);
}
case ARMISD::VLD3DUP: {
unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo, ARM::VLD3DUPd16Pseudo,
ARM::VLD3DUPd32Pseudo };
- return SelectVLDDup(N, 3, Opcodes);
+ return SelectVLDDup(N, false, 3, Opcodes);
}
case ARMISD::VLD4DUP: {
unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo, ARM::VLD4DUPd16Pseudo,
ARM::VLD4DUPd32Pseudo };
- return SelectVLDDup(N, 4, Opcodes);
+ return SelectVLDDup(N, false, 4, Opcodes);
+ }
+
+ case ARMISD::VLD2DUP_UPD: {
+ unsigned Opcodes[] = { ARM::VLD2DUPd8Pseudo_UPD, ARM::VLD2DUPd16Pseudo_UPD,
+ ARM::VLD2DUPd32Pseudo_UPD };
+ return SelectVLDDup(N, true, 2, Opcodes);
+ }
+
+ case ARMISD::VLD3DUP_UPD: {
+ unsigned Opcodes[] = { ARM::VLD3DUPd8Pseudo_UPD, ARM::VLD3DUPd16Pseudo_UPD,
+ ARM::VLD3DUPd32Pseudo_UPD };
+ return SelectVLDDup(N, true, 3, Opcodes);
+ }
+
+ case ARMISD::VLD4DUP_UPD: {
+ unsigned Opcodes[] = { ARM::VLD4DUPd8Pseudo_UPD, ARM::VLD4DUPd16Pseudo_UPD,
+ ARM::VLD4DUPd32Pseudo_UPD };
+ return SelectVLDDup(N, true, 4, Opcodes);
+ }
+
+ case ARMISD::VLD1_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD1d8_UPD, ARM::VLD1d16_UPD,
+ ARM::VLD1d32_UPD, ARM::VLD1d64_UPD };
+ unsigned QOpcodes[] = { ARM::VLD1q8Pseudo_UPD, ARM::VLD1q16Pseudo_UPD,
+ ARM::VLD1q32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
+ return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
+ }
+
+ case ARMISD::VLD2_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD2d8Pseudo_UPD, ARM::VLD2d16Pseudo_UPD,
+ ARM::VLD2d32Pseudo_UPD, ARM::VLD1q64Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VLD2q8Pseudo_UPD, ARM::VLD2q16Pseudo_UPD,
+ ARM::VLD2q32Pseudo_UPD };
+ return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
+ }
+
+ case ARMISD::VLD3_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD3d8Pseudo_UPD, ARM::VLD3d16Pseudo_UPD,
+ ARM::VLD3d32Pseudo_UPD, ARM::VLD1d64TPseudo_UPD };
+ unsigned QOpcodes0[] = { ARM::VLD3q8Pseudo_UPD,
+ ARM::VLD3q16Pseudo_UPD,
+ ARM::VLD3q32Pseudo_UPD };
+ unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo_UPD,
+ ARM::VLD3q16oddPseudo_UPD,
+ ARM::VLD3q32oddPseudo_UPD };
+ return SelectVLD(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case ARMISD::VLD4_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD,
+ ARM::VLD4d32Pseudo_UPD, ARM::VLD1d64QPseudo_UPD };
+ unsigned QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD };
+ unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
+ ARM::VLD4q16oddPseudo_UPD,
+ ARM::VLD4q32oddPseudo_UPD };
+ return SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case ARMISD::VLD2LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo_UPD, ARM::VLD2LNd16Pseudo_UPD,
+ ARM::VLD2LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo_UPD,
+ ARM::VLD2LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, true, true, 2, DOpcodes, QOpcodes);
+ }
+
+ case ARMISD::VLD3LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo_UPD, ARM::VLD3LNd16Pseudo_UPD,
+ ARM::VLD3LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo_UPD,
+ ARM::VLD3LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, true, true, 3, DOpcodes, QOpcodes);
+ }
+
+ case ARMISD::VLD4LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo_UPD, ARM::VLD4LNd16Pseudo_UPD,
+ ARM::VLD4LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo_UPD,
+ ARM::VLD4LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, true, true, 4, DOpcodes, QOpcodes);
+ }
+
+ case ARMISD::VST1_UPD: {
+ unsigned DOpcodes[] = { ARM::VST1d8_UPD, ARM::VST1d16_UPD,
+ ARM::VST1d32_UPD, ARM::VST1d64_UPD };
+ unsigned QOpcodes[] = { ARM::VST1q8Pseudo_UPD, ARM::VST1q16Pseudo_UPD,
+ ARM::VST1q32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
+ return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
+ }
+
+ case ARMISD::VST2_UPD: {
+ unsigned DOpcodes[] = { ARM::VST2d8Pseudo_UPD, ARM::VST2d16Pseudo_UPD,
+ ARM::VST2d32Pseudo_UPD, ARM::VST1q64Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VST2q8Pseudo_UPD, ARM::VST2q16Pseudo_UPD,
+ ARM::VST2q32Pseudo_UPD };
+ return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
+ }
+
+ case ARMISD::VST3_UPD: {
+ unsigned DOpcodes[] = { ARM::VST3d8Pseudo_UPD, ARM::VST3d16Pseudo_UPD,
+ ARM::VST3d32Pseudo_UPD, ARM::VST1d64TPseudo_UPD };
+ unsigned QOpcodes0[] = { ARM::VST3q8Pseudo_UPD,
+ ARM::VST3q16Pseudo_UPD,
+ ARM::VST3q32Pseudo_UPD };
+ unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo_UPD,
+ ARM::VST3q16oddPseudo_UPD,
+ ARM::VST3q32oddPseudo_UPD };
+ return SelectVST(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case ARMISD::VST4_UPD: {
+ unsigned DOpcodes[] = { ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD,
+ ARM::VST4d32Pseudo_UPD, ARM::VST1d64QPseudo_UPD };
+ unsigned QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD };
+ unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD };
+ return SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ }
+
+ case ARMISD::VST2LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo_UPD, ARM::VST2LNd16Pseudo_UPD,
+ ARM::VST2LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo_UPD,
+ ARM::VST2LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, false, true, 2, DOpcodes, QOpcodes);
+ }
+
+ case ARMISD::VST3LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo_UPD, ARM::VST3LNd16Pseudo_UPD,
+ ARM::VST3LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo_UPD,
+ ARM::VST3LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, false, true, 3, DOpcodes, QOpcodes);
+ }
+
+ case ARMISD::VST4LN_UPD: {
+ unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo_UPD, ARM::VST4LNd16Pseudo_UPD,
+ ARM::VST4LNd32Pseudo_UPD };
+ unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo_UPD,
+ ARM::VST4LNq32Pseudo_UPD };
+ return SelectVLDSTLane(N, false, true, 4, DOpcodes, QOpcodes);
}
case ISD::INTRINSIC_VOID:
@@ -2497,7 +2701,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD1d32, ARM::VLD1d64 };
unsigned QOpcodes[] = { ARM::VLD1q8Pseudo, ARM::VLD1q16Pseudo,
ARM::VLD1q32Pseudo, ARM::VLD1q64Pseudo };
- return SelectVLD(N, 1, DOpcodes, QOpcodes, 0);
+ return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vld2: {
@@ -2505,7 +2709,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VLD2d32Pseudo, ARM::VLD1q64Pseudo };
unsigned QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
ARM::VLD2q32Pseudo };
- return SelectVLD(N, 2, DOpcodes, QOpcodes, 0);
+ return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vld3: {
@@ -2517,7 +2721,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned QOpcodes1[] = { ARM::VLD3q8oddPseudo,
ARM::VLD3q16oddPseudo,
ARM::VLD3q32oddPseudo };
- return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return SelectVLD(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vld4: {
@@ -2529,28 +2733,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned QOpcodes1[] = { ARM::VLD4q8oddPseudo,
ARM::VLD4q16oddPseudo,
ARM::VLD4q32oddPseudo };
- return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return SelectVLD(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vld2lane: {
unsigned DOpcodes[] = { ARM::VLD2LNd8Pseudo, ARM::VLD2LNd16Pseudo,
ARM::VLD2LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VLD2LNq16Pseudo, ARM::VLD2LNq32Pseudo };
- return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, true, false, 2, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vld3lane: {
unsigned DOpcodes[] = { ARM::VLD3LNd8Pseudo, ARM::VLD3LNd16Pseudo,
ARM::VLD3LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VLD3LNq16Pseudo, ARM::VLD3LNq32Pseudo };
- return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, true, false, 3, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vld4lane: {
unsigned DOpcodes[] = { ARM::VLD4LNd8Pseudo, ARM::VLD4LNd16Pseudo,
ARM::VLD4LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VLD4LNq16Pseudo, ARM::VLD4LNq32Pseudo };
- return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, true, false, 4, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst1: {
@@ -2558,7 +2762,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST1d32, ARM::VST1d64 };
unsigned QOpcodes[] = { ARM::VST1q8Pseudo, ARM::VST1q16Pseudo,
ARM::VST1q32Pseudo, ARM::VST1q64Pseudo };
- return SelectVST(N, 1, DOpcodes, QOpcodes, 0);
+ return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vst2: {
@@ -2566,7 +2770,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
ARM::VST2d32Pseudo, ARM::VST1q64Pseudo };
unsigned QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
ARM::VST2q32Pseudo };
- return SelectVST(N, 2, DOpcodes, QOpcodes, 0);
+ return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
}
case Intrinsic::arm_neon_vst3: {
@@ -2578,7 +2782,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned QOpcodes1[] = { ARM::VST3q8oddPseudo,
ARM::VST3q16oddPseudo,
ARM::VST3q32oddPseudo };
- return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+ return SelectVST(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vst4: {
@@ -2590,28 +2794,28 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
unsigned QOpcodes1[] = { ARM::VST4q8oddPseudo,
ARM::VST4q16oddPseudo,
ARM::VST4q32oddPseudo };
- return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return SelectVST(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
}
case Intrinsic::arm_neon_vst2lane: {
unsigned DOpcodes[] = { ARM::VST2LNd8Pseudo, ARM::VST2LNd16Pseudo,
ARM::VST2LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VST2LNq16Pseudo, ARM::VST2LNq32Pseudo };
- return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, false, false, 2, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst3lane: {
unsigned DOpcodes[] = { ARM::VST3LNd8Pseudo, ARM::VST3LNd16Pseudo,
ARM::VST3LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VST3LNq16Pseudo, ARM::VST3LNq32Pseudo };
- return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, false, false, 3, DOpcodes, QOpcodes);
}
case Intrinsic::arm_neon_vst4lane: {
unsigned DOpcodes[] = { ARM::VST4LNd8Pseudo, ARM::VST4LNd16Pseudo,
ARM::VST4LNd32Pseudo };
unsigned QOpcodes[] = { ARM::VST4LNq16Pseudo, ARM::VST4LNq32Pseudo };
- return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes);
+ return SelectVLDSTLane(N, false, false, 4, DOpcodes, QOpcodes);
}
}
break;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 92ea6cb0f8..59a71553bf 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -457,6 +457,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
@@ -857,6 +859,23 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VLD2DUP: return "ARMISD::VLD2DUP";
case ARMISD::VLD3DUP: return "ARMISD::VLD3DUP";
case ARMISD::VLD4DUP: return "ARMISD::VLD4DUP";
+ case ARMISD::VLD1_UPD: return "ARMISD::VLD1_UPD";
+ case ARMISD::VLD2_UPD: return "ARMISD::VLD2_UPD";
+ case ARMISD::VLD3_UPD: return "ARMISD::VLD3_UPD";
+ case ARMISD::VLD4_UPD: return "ARMISD::VLD4_UPD";
+ case ARMISD::VLD2LN_UPD: return "ARMISD::VLD2LN_UPD";
+ case ARMISD::VLD3LN_UPD: return "ARMISD::VLD3LN_UPD";
+ case ARMISD::VLD4LN_UPD: return "ARMISD::VLD4LN_UPD";
+ case ARMISD::VLD2DUP_UPD: return "ARMISD::VLD2DUP_UPD";
+ case ARMISD::VLD3DUP_UPD: return "ARMISD::VLD3DUP_UPD";
+ case ARMISD::VLD4DUP_UPD: return "ARMISD::VLD4DUP_UPD";
+ case ARMISD::VST1_UPD: return "ARMISD::VST1_UPD";
+ case ARMISD::VST2_UPD: return "ARMISD::VST2_UPD";
+ case ARMISD::VST3_UPD: return "ARMISD::VST3_UPD";
+ case ARMISD::VST4_UPD: return "ARMISD::VST4_UPD";
+ case ARMISD::VST2LN_UPD: return "ARMISD::VST2LN_UPD";
+ case ARMISD::VST3LN_UPD: return "ARMISD::VST3LN_UPD";
+ case ARMISD::VST4LN_UPD: return "ARMISD::VST4LN_UPD";
}
}
@@ -5210,6 +5229,138 @@ static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
DAG.getUNDEF(VT), NewMask.data());
}
+/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP and
+/// NEON load/store intrinsics to merge base address updates.
+///
+/// N is either a NEON load/store intrinsic node (ISD::INTRINSIC_VOID or
+/// ISD::INTRINSIC_W_CHAIN) or an ARMISD::VLD2DUP/VLD3DUP/VLD4DUP node.  If the
+/// address operand of N is also incremented by an ADD that is independent of
+/// N, and the increment is compatible with the amount of memory accessed, N is
+/// replaced by the matching ARMISD::*_UPD node and the ADD is replaced by the
+/// i32 result of that node.  Both replacements go through DCI.CombineTo, so
+/// the function itself always returns an empty SDValue.
+static SDValue CombineBaseUpdate(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+  // Intrinsic nodes carry the intrinsic ID in operand 1, so their address is
+  // one operand further along than for the VLDDUP nodes.
+  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  SDValue Addr = N->getOperand(AddrOpIdx);
+
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+         UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD ||
+        UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load/store.  Otherwise, folding
+    // it would create a cycle.
+    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
+      continue;
+
+    // Find the new opcode for the updating load/store.
+    bool isLoad = true;
+    bool isLaneOp = false;
+    unsigned NewOpc = 0;
+    unsigned NumVecs = 0;
+    if (isIntrinsic) {
+      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+      switch (IntNo) {
+      default:
+        // Give up rather than fall through into the vld1 case when assertions
+        // are compiled out.
+        assert(0 && "unexpected intrinsic for Neon base update");
+        return SDValue();
+      case Intrinsic::arm_neon_vld1:     NewOpc = ARMISD::VLD1_UPD;
+        NumVecs = 1; break;
+      case Intrinsic::arm_neon_vld2:     NewOpc = ARMISD::VLD2_UPD;
+        NumVecs = 2; break;
+      case Intrinsic::arm_neon_vld3:     NewOpc = ARMISD::VLD3_UPD;
+        NumVecs = 3; break;
+      case Intrinsic::arm_neon_vld4:     NewOpc = ARMISD::VLD4_UPD;
+        NumVecs = 4; break;
+      case Intrinsic::arm_neon_vld2lane: NewOpc = ARMISD::VLD2LN_UPD;
+        NumVecs = 2; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld3lane: NewOpc = ARMISD::VLD3LN_UPD;
+        NumVecs = 3; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vld4lane: NewOpc = ARMISD::VLD4LN_UPD;
+        NumVecs = 4; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst1:     NewOpc = ARMISD::VST1_UPD;
+        NumVecs = 1; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2:     NewOpc = ARMISD::VST2_UPD;
+        NumVecs = 2; isLoad = false; break;
+      case Intrinsic::arm_neon_vst3:     NewOpc = ARMISD::VST3_UPD;
+        NumVecs = 3; isLoad = false; break;
+      case Intrinsic::arm_neon_vst4:     NewOpc = ARMISD::VST4_UPD;
+        NumVecs = 4; isLoad = false; break;
+      case Intrinsic::arm_neon_vst2lane: NewOpc = ARMISD::VST2LN_UPD;
+        NumVecs = 2; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst3lane: NewOpc = ARMISD::VST3LN_UPD;
+        NumVecs = 3; isLoad = false; isLaneOp = true; break;
+      case Intrinsic::arm_neon_vst4lane: NewOpc = ARMISD::VST4LN_UPD;
+        NumVecs = 4; isLoad = false; isLaneOp = true; break;
+      }
+    } else {
+      isLaneOp = true;
+      switch (N->getOpcode()) {
+      default:
+        // Give up rather than fall through into the VLD2DUP case when
+        // assertions are compiled out.
+        assert(0 && "unexpected opcode for Neon base update");
+        return SDValue();
+      case ARMISD::VLD2DUP: NewOpc = ARMISD::VLD2DUP_UPD; NumVecs = 2; break;
+      case ARMISD::VLD3DUP: NewOpc = ARMISD::VLD3DUP_UPD; NumVecs = 3; break;
+      case ARMISD::VLD4DUP: NewOpc = ARMISD::VLD4DUP_UPD; NumVecs = 4; break;
+      }
+    }
+
+    // Find the size of memory referenced by the load/store.  Loads take the
+    // vector type from their first result, stores from the first operand
+    // after the address.
+    EVT VecTy;
+    if (isLoad)
+      VecTy = N->getValueType(0);
+    else
+      VecTy = N->getOperand(AddrOpIdx+1).getValueType();
+    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+    // Lane and dup operations are sized as one element per vector rather than
+    // as whole vectors.
+    if (isLaneOp)
+      NumBytes /= VecTy.getVectorNumElements();
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint64_t IncVal = CInc->getZExtValue();
+      if (IncVal != NumBytes)
+        continue;
+    } else if (NumBytes >= 3 * 16) {
+      // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
+      // separate instructions that make it harder to use a non-constant update.
+      continue;
+    }
+
+    // Create the new updating load/store node.  Its results are the loaded
+    // vectors (if any), then the i32 value that replaces the ADD, then the
+    // chain.
+    EVT Tys[6];
+    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned n;
+    for (n = 0; n < NumResultVecs; ++n)
+      Tys[n] = VecTy;
+    Tys[n++] = MVT::i32;
+    Tys[n] = MVT::Other;
+    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+    // Operands: chain, address, increment, then every operand of N that
+    // followed the address.
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // incoming chain
+    Ops.push_back(N->getOperand(AddrOpIdx));
+    Ops.push_back(Inc);
+    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
+      Ops.push_back(N->getOperand(i));
+    }
+    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, N->getDebugLoc(), SDTys,
+                                           Ops.data(), Ops.size(),
+                                           MemInt->getMemoryVT(),
+                                           MemInt->getMemOperand());
+
+    // Update the uses.  The new node has an extra i32 result between the
+    // vectors and the chain, so the chain index is shifted by one.
+    std::vector<SDValue> NewResults;
+    for (unsigned i = 0; i < NumResultVecs; ++i) {
+      NewResults.push_back(SDValue(UpdN.getNode(), i));
+    }
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs+1)); // chain
+    DCI.CombineTo(N, NewResults);
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+    break;
+  }
+  return SDValue();
+}
+
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
@@ -5720,6 +5871,31 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
case ISD::SELECT_CC: return PerformSELECT_CCCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::VLD2DUP:
+ case ARMISD::VLD3DUP:
+ case ARMISD::VLD4DUP:
+ return CombineBaseUpdate(N, DCI);
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN:
+ switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::arm_neon_vld1:
+ case Intrinsic::arm_neon_vld2:
+ case Intrinsic::arm_neon_vld3:
+ case Intrinsic::arm_neon_vld4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
+ case Intrinsic::arm_neon_vst1:
+ case Intrinsic::arm_neon_vst2:
+ case Intrinsic::arm_neon_vst3:
+ case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane:
+ return CombineBaseUpdate(N, DCI);
+ default: break;
+ }
+ break;
}
return SDValue();
}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index b06b8d3e15..dc400c485e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -181,7 +181,28 @@ namespace llvm {
// Vector load N-element structure to all lanes:
VLD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
VLD3DUP,
- VLD4DUP
+ VLD4DUP,
+
+ // NEON loads with post-increment base updates:
+ VLD1_UPD,
+ VLD2_UPD,
+ VLD3_UPD,
+ VLD4_UPD,
+ VLD2LN_UPD,
+ VLD3LN_UPD,
+ VLD4LN_UPD,
+ VLD2DUP_UPD,
+ VLD3DUP_UPD,
+ VLD4DUP_UPD,
+
+ // NEON stores with post-increment base updates:
+ VST1_UPD,
+ VST2_UPD,
+ VST3_UPD,
+ VST4_UPD,
+ VST2LN_UPD,
+ VST3LN_UPD,
+ VST4LN_UPD
};
}
diff --git a/test/CodeGen/ARM/vld1.ll b/test/CodeGen/ARM/vld1.ll
index 16bd832bb1..c886125a2f 100644
--- a/test/CodeGen/ARM/vld1.ll
+++ b/test/CodeGen/ARM/vld1.ll
@@ -16,6 +16,18 @@ define <4 x i16> @vld1i16(i16* %A) nounwind {
ret <4 x i16> %tmp1
}
+;Check that the pointer increment is folded into the load as a post-increment (writeback) update.
+define <4 x i16> @vld1i16_update(i16** %ptr) nounwind {
+;CHECK: vld1i16_update:
+;CHECK: vld1.16 {d16}, [r1]!
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %tmp0, i32 1)
+ %tmp2 = getelementptr i16* %A, i32 4 ; 4 x i16 = 8 bytes, the size of the <4 x i16> loaded above
+ store i16* %tmp2, i16** %ptr
+ ret <4 x i16> %tmp1
+}
+
define <2 x i32> @vld1i32(i32* %A) nounwind {
;CHECK: vld1i32:
;CHECK: vld1.32
@@ -24,6 +36,18 @@ define <2 x i32> @vld1i32(i32* %A) nounwind {
ret <2 x i32> %tmp1
}
+;Check that a non-constant pointer increment is folded into the load as a register post-increment.
+define <2 x i32> @vld1i32_update(i32** %ptr, i32 %inc) nounwind {
+;CHECK: vld1i32_update:
+;CHECK: vld1.32 {d16}, [r2], r1
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %tmp0, i32 1)
+ %tmp2 = getelementptr i32* %A, i32 %inc ; increment derived from %inc, not a constant
+ store i32* %tmp2, i32** %ptr
+ ret <2 x i32> %tmp1
+}
+
define <2 x float> @vld1f(float* %A) nounwind {
;CHECK: vld1f:
;CHECK: vld1.32
@@ -48,6 +72,17 @@ define <16 x i8> @vld1Qi8(i8* %A) nounwind {
ret <16 x i8> %tmp1
}
+;Check that the pointer increment is folded into the 128-bit load as a post-increment (writeback) update.
+define <16 x i8> @vld1Qi8_update(i8** %ptr) nounwind {
+;CHECK: vld1Qi8_update:
+;CHECK: vld1.8 {d16, d17}, [r1, :64]!
+ %A = load i8** %ptr
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %A, i32 8)
+ %tmp2 = getelementptr i8* %A, i32 16 ; 16 bytes, the size of the <16 x i8> loaded above
+ store i8* %tmp2, i8** %ptr
+ ret <16 x i8> %tmp1
+}
+
define <8 x i16> @vld1Qi16(i16* %A) nounwind {
;CHECK: vld1Qi16:
;Check the alignment value. Max for this instruction is 128 bits:
diff --git a/test/CodeGen/ARM/vld2.ll b/test/CodeGen/ARM/vld2.ll
index 3fdd7b7aa8..29b379465d 100644
--- a/test/CodeGen/ARM/vld2.ll
+++ b/test/CodeGen/ARM/vld2.ll
@@ -56,6 +56,21 @@ define <2 x float> @vld2f(float* %A) nounwind {
ret <2 x float> %tmp4
}
+;Check that the pointer increment is folded into the vld2 as a post-increment (writeback) update.
+define <2 x float> @vld2f_update(float** %ptr) nounwind {
+;CHECK: vld2f_update:
+;CHECK: vld2.32 {d16, d17}, [r1]!
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ %tmp1 = call %struct.__neon_float32x2x2_t @llvm.arm.neon.vld2.v2f32(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_float32x2x2_t %tmp1, 1
+ %tmp4 = fadd <2 x float> %tmp2, %tmp3
+ %tmp5 = getelementptr float* %A, i32 4 ; 4 x float = 16 bytes, the size of the two <2 x float> vectors loaded
+ store float* %tmp5, float** %ptr
+ ret <2 x float> %tmp4
+}
+
define <1 x i64> @vld2i64(i64* %A) nounwind {
;CHECK: vld2i64:
;Check the alignment value. Max for this instruction is 128 bits:
@@ -79,6 +94,20 @@ define <16 x i8> @vld2Qi8(i8* %A) nounwind {
ret <16 x i8> %tmp4
}
+;Check that a non-constant pointer increment is folded into the 128-bit vld2 as a register post-increment.
+define <16 x i8> @vld2Qi8_update(i8** %ptr, i32 %inc) nounwind {
+;CHECK: vld2Qi8_update:
+;CHECK: vld2.8 {d16, d17, d18, d19}, [r2, :128], r1
+ %A = load i8** %ptr
+ %tmp1 = call %struct.__neon_int8x16x2_t @llvm.arm.neon.vld2.v16i8(i8* %A, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x16x2_t %tmp1, 1
+ %tmp4 = add <16 x i8> %tmp2, %tmp3
+ %tmp5 = getelementptr i8* %A, i32 %inc ; increment is %inc, not a constant
+ store i8* %tmp5, i8** %ptr
+ ret <16 x i8> %tmp4
+}
+
define <8 x i16> @vld2Qi16(i16* %A) nounwind {
;CHECK: vld2Qi16:
;Check the alignment value. Max for this instruction is 256 bits:
diff --git a/test/CodeGen/ARM/vld3.ll b/test/CodeGen/ARM/vld3.ll
index 0e541bbb5a..dde530f6df 100644
--- a/test/CodeGen/ARM/vld3.ll
+++ b/test/CodeGen/ARM/vld3.ll
@@ -33,6 +33,21 @@ define <4 x i16> @vld3i16(i16* %A) nounwind {
ret <4 x i16> %tmp4
}
+;Check that a non-constant pointer increment is folded into the vld3 as a register post-increment.
+define <4 x i16> @vld3i16_update(i16** %ptr, i32 %inc) nounwind {
+;CHECK: vld3i16_update:
+;CHECK: vld3.16 {d16, d17, d18}, [r2], r1
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3.v4i16(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x4x3_t %tmp1, 2
+ %tmp4 = add <4 x i16> %tmp2, %tmp3
+ %tmp5 = getelementptr i16* %A, i32 %inc ; increment derived from %inc, not a constant
+ store i16* %tmp5, i16** %ptr
+ ret <4 x i16> %tmp4
+}
+
define <2 x i32> @vld3i32(i32* %A) nounwind {
;CHECK: vld3i32:
;CHECK: vld3.32
@@ -103,6 +118,22 @@ define <4 x i32> @vld3Qi32(i32* %A) nounwind {
ret <4 x i32> %tmp4
}
+;Check for a post-increment updating load; the 128-bit vld3 is expected as two instructions, each with writeback.
+define <4 x i32> @vld3Qi32_update(i32** %ptr) nounwind {
+;CHECK: vld3Qi32_update:
+;CHECK: vld3.32 {d16, d18, d20}, [r1]!
+;CHECK: vld3.32 {d17, d19, d21}, [r1]!
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = call %struct.__neon_int32x4x3_t @llvm.arm.neon.vld3.v4i32(i8* %tmp0, i32 1)
+ %tmp2 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int32x4x3_t %tmp1, 2
+ %tmp4 = add <4 x i32> %tmp2, %tmp3
+ %tmp5 = getelementptr i32* %A, i32 12 ; 12 x i32 = 48 bytes, the size of the three <4 x i32> vectors loaded
+ store i32* %tmp5, i32** %ptr
+ ret <4 x i32> %tmp4
+}
+
define <4 x float> @vld3Qf(float* %A) nounwind {
;CHECK: vld3Qf:
;CHECK: vld3.32
diff --git a/test/CodeGen/ARM/vld4.ll b/test/CodeGen/ARM/vld4.ll
index a616a98a25..59a73db318 100644
--- a/test/CodeGen/ARM/vld4.ll
+++ b/test/CodeGen/ARM/vld4.ll
@@ -22,6 +22,20 @@ define <8 x i8> @vld4i8(i8* %A) nounwind {
ret <8 x i8> %tmp4
}
+;Check that a non-constant pointer increment is folded into the vld4 as a register post-increment.
+define <8 x i8> @vld4i8_update(i8** %ptr, i32 %inc) nounwind {
+;CHECK: vld4i8_update:
+;CHECK: vld4.8 {d16, d17, d18, d19}, [r2, :128], r1
+ %A = load i8** %ptr
+ %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8(i8* %A, i32 16)
+ %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 2
+ %tmp4 = add <8 x i8> %tmp2, %tmp3
+ %tmp5 = getelementptr i8* %A, i32 %inc ; increment is %inc, not a constant
+ store i8* %tmp5, i8** %ptr
+ ret <8 x i8> %tmp4
+}
+
define <4 x i16> @vld4i16(i16* %A) nounwind {
;CHECK: vld4i16:
;Check the alignment value. Max for this instruction is 256 bits:
@@ -94,6 +108,22 @@ define <8 x i16> @vld4Qi16(i16* %A) nounwind {
ret <8 x i16> %tmp4
}
+;Check for a post-increment updating load; the 128-bit vld4 is expected as two instructions, each with writeback.
+define <8 x i16> @vld4Qi16_update(i16** %ptr) nounwind {
+;CHECK: vld4Qi16_update:
+;CHECK: vld4.16 {d16, d18, d20, d22}, [r1, :64]!
+;CHECK: vld4.16 {d17, d19, d21, d23}, [r1, :64]!
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = call %struct.__neon_int16x8x4_t @llvm.arm.neon.vld4.v8i16(i8* %tmp0, i32 8)
+ %tmp2 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 0
+ %tmp3 = extractvalue %struct.__neon_int16x8x4_t %tmp1, 2
+ %tmp4 = add <8 x i16> %tmp2, %tmp3
+ %tmp5 = getelementptr i16* %A, i32 32 ; 32 x i16 = 64 bytes, the size of the four <8 x i16> vectors loaded
+ store i16* %tmp5, i16** %ptr
+ ret <8 x i16> %tmp4
+}
+
define <4 x i32> @vld4Qi32(i32* %A) nounwind {
;CHECK: vld4Qi32:
;CHECK: vld4.32
diff --git a/test/CodeGen/ARM/vlddup.ll b/test/CodeGen/ARM/vlddup.ll
index 033febbb24..d0e9ac3ad3 100644
--- a/test/CodeGen/ARM/vlddup.ll
+++ b/test/CodeGen/ARM/vlddup.ll
@@ -89,6 +89,22 @@ define <4 x i16> @vld2dupi16(i16* %A) nounwind {
ret <4 x i16> %tmp5
}
+;Check for a post-increment updating load: the lane load plus splat shuffles should become an all-lanes vld2 with writeback.
+define <4 x i16> @vld2dupi16_update(i16** %ptr) nounwind {
+;CHECK: vld2dupi16_update:
+;CHECK: vld2.16 {d16[], d17[]}, [r1]!
+ %A = load i16** %ptr
+ %tmp0 = tail call %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %tmp1 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int4x16x2_t %tmp0, 1
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp5 = add <4 x i16> %tmp2, %tmp4
+ %tmp6 = getelementptr i16* %A, i32 2 ; 2 x i16 = 4 bytes: one i16 element for each of the 2 vectors
+ store i16* %tmp6, i16** %ptr
+ ret <4 x i16> %tmp5
+}
+
define <2 x i32> @vld2dupi32(i32* %A) nounwind {
;CHECK: vld2dupi32:
;Check the alignment value. Max for this instruction is 64 bits:
@@ -106,8 +122,28 @@ declare %struct.__neon_int8x8x2_t @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8
declare %struct.__neon_int4x16x2_t @llvm.arm.neon.vld2lane.v4i16(i16*, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int2x32x2_t @llvm.arm.neon.vld2lane.v2i32(i32*, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
+%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
%struct.__neon_int16x4x3_t = type { <4 x i16>, <4 x i16>, <4 x i16> }
+;Check that a non-constant pointer increment is folded into the all-lanes vld3 as a register post-increment.
+define <8 x i8> @vld3dupi8_update(i8** %ptr, i32 %inc) nounwind {
+;CHECK: vld3dupi8_update:
+;CHECK: vld3.8 {d16[], d17[], d18[]}, [r2], r1
+ %A = load i8** %ptr
+ %tmp0 = tail call %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8* %A, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 8)
+ %tmp1 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 0
+ %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 1
+ %tmp4 = shufflevector <8 x i8> %tmp3, <8 x i8> undef, <8 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int8x8x3_t %tmp0, 2
+ %tmp6 = shufflevector <8 x i8> %tmp5, <8 x i8> undef, <8 x i32> zeroinitializer
+ %tmp7 = add <8 x i8> %tmp2, %tmp4
+ %tmp8 = add <8 x i8> %tmp7, %tmp6
+ %tmp9 = getelementptr i8* %A, i32 %inc ; increment is %inc, not a constant
+ store i8* %tmp9, i8** %ptr
+ ret <8 x i8> %tmp8
+}
+
define <4 x i16> @vld3dupi16(i16* %A) nounwind {
;CHECK: vld3dupi16:
;Check the (default) alignment value. VLD3 does not support alignment.
@@ -124,10 +160,34 @@ define <4 x i16> @vld3dupi16(i16* %A) nounwind {
ret <4 x i16> %tmp8
}
+declare %struct.__neon_int8x8x3_t @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind readonly
declare %struct.__neon_int16x4x3_t @llvm.arm.neon.vld3lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
+%struct.__neon_int16x4x4_t = type { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> }
%struct.__neon_int32x2x4_t = type { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> }
+;Check for a post-increment updating load: the lane load plus splat shuffles should become an all-lanes vld4 with writeback.
+define <4 x i16> @vld4dupi16_update(i16** %ptr) nounwind {
+;CHECK: vld4dupi16_update:
+;CHECK: vld4.16 {d16[], d17[], d18[], d19[]}, [r1]!
+ %A = load i16** %ptr
+ %tmp0 = tail call %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16* %A, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 1)
+ %tmp1 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 0
+ %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp3 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 1
+ %tmp4 = shufflevector <4 x i16> %tmp3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp5 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 2
+ %tmp6 = shufflevector <4 x i16> %tmp5, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp7 = extractvalue %struct.__neon_int16x4x4_t %tmp0, 3
+ %tmp8 = shufflevector <4 x i16> %tmp7, <4 x i16> undef, <4 x i32> zeroinitializer
+ %tmp9 = add <4 x i16> %tmp2, %tmp4
+ %tmp10 = add <4 x i16> %tmp6, %tmp8
+ %tmp11 = add <4 x i16> %tmp9, %tmp10
+ %tmp12 = getelementptr i16* %A, i32 4 ; 4 x i16 = 8 bytes: one i16 element for each of the 4 vectors
+ store i16* %tmp12, i16** %ptr
+ ret <4 x i16> %tmp11
+}
+
define <2 x i32> @vld4dupi32(i32* %A) nounwind {
;CHECK: vld4dupi32:
;Check the alignment value. An 8-byte alignment is allowed here even though
@@ -148,4 +208,5 @@ define <2 x i32> @vld4dupi32(i32* %A) nounwind {
ret <2 x i32> %tmp11
}
+declare %struct.__neon_int16x4x4_t @llvm.arm.neon.vld4lane.v4i16(i16*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind readonly
declare %struct.__neon_int32x2x4_t @llvm.arm.neon.vld4lane.v2i32(i32*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind readonly
diff --git a/test/CodeGen/ARM/vldlane.ll b/test/CodeGen/ARM/vldlane.ll
index c5514a63fd..770ed071ac 100644
--- a/test/CodeGen/ARM/vldlane.ll
+++ b/test/CodeGen/ARM/vldlane.ll
@@ -121,6 +121,22 @@ define <2 x i32> @vld2lanei32(i32* %A, <2 x i32>* %B) nounwind {
ret <2 x i32> %tmp5
}
+;Check that the pointer increment is folded into the lane load as a post-increment (writeback) update.
+define <2 x i32> @vld2lanei32_update(i32** %ptr, <2 x i32>* %B) nounwind {
+;CHECK: vld2lanei32_update:
+;CHECK: vld2.32 {d16[1], d17[1]}, [r1]!
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <2 x i32>* %B
+ %tmp2 = call %struct.__neon_int32x2x2_t @llvm.arm.neon.vld2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1)
+ %tmp3 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int32x2x2_t %tmp2, 1
+ %tmp5 = add <2 x i32> %tmp3, %tmp4
+ %tmp6 = getelementptr i32* %A, i32 2 ; 2 x i32 = 8 bytes: one i32 lane for each of the 2 vectors
+ store i32* %tmp6, i32** %ptr
+ ret <2 x i32> %tmp5
+}
+
define <2 x float> @vld2lanef(float* %A, <2 x float>* %B) nounwind {
;CHECK: vld2lanef:
;CHECK: vld2.32
@@ -260,6 +276,24 @@ define <8 x i16> @vld3laneQi16(i16* %A, <8 x i16>* %B) nounwind {
ret <8 x i16> %tmp7
}
+;Check that a non-constant pointer increment is folded into the lane load as a register post-increment.
+define <8 x i16> @vld3laneQi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
+;CHECK: vld3laneQi16_update:
+;CHECK: vld3.16 {d16[1], d18[1], d20[1]}, [r2], r1
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ %tmp2 = call %struct.__neon_int16x8x3_t @llvm.arm.neon.vld3lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int16x8x3_t %tmp2, 2
+ %tmp6 = add <8 x i16> %tmp3, %tmp4
+ %tmp7 = add <8 x i16> %tmp5, %tmp6
+ %tmp8 = getelementptr i16* %A, i32 %inc ; increment derived from %inc, not a constant
+ store i16* %tmp8, i16** %ptr
+ ret <8 x i16> %tmp7
+}
+
define <4 x i32> @vld3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vld3laneQi32:
;CHECK: vld3.32
@@ -322,6 +356,25 @@ define <8 x i8> @vld4lanei8(i8* %A, <8 x i8>* %B) nounwind {
ret <8 x i8> %tmp9
}
+;Check that the pointer increment is folded into the lane load as a post-increment (writeback) update.
+define <8 x i8> @vld4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
+;CHECK: vld4lanei8_update:
+;CHECK: vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
+ %A = load i8** %ptr
+ %tmp1 = load <8 x i8>* %B
+ %tmp2 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
+ %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0
+ %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1
+ %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2
+ %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3
+ %tmp7 = add <8 x i8> %tmp3, %tmp4
+ %tmp8 = add <8 x i8> %tmp5, %tmp6
+ %tmp9 = add <8 x i8> %tmp7, %tmp8
+ %tmp10 = getelementptr i8* %A, i32 4 ; 4 bytes: one i8 lane for each of the 4 vectors
+ store i8* %tmp10, i8** %ptr
+ ret <8 x i8> %tmp9
+}
+
define <4 x i16> @vld4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vld4lanei16:
;Check that a power-of-two alignment smaller than the total size of the memory
diff --git a/test/CodeGen/ARM/vst1.ll b/test/CodeGen/ARM/vst1.ll
index 70f3a4cfa2..364d44b711 100644
--- a/test/CodeGen/ARM/vst1.ll
+++ b/test/CodeGen/ARM/vst1.ll
@@ -36,6 +36,19 @@ define void @vst1f(float* %A, <2 x float>* %B) nounwind {
ret void
}
+;Check that the pointer increment is folded into the store as a post-increment (writeback) update.
+define void @vst1f_update(float** %ptr, <2 x float>* %B) nounwind {
+;CHECK: vst1f_update:
+;CHECK: vst1.32 {d16}, [r1]!
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ %tmp1 = load <2 x float>* %B
+ call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1)
+ %tmp2 = getelementptr float* %A, i32 2 ; 2 x float = 8 bytes, the size of the <2 x float> stored above
+ store float* %tmp2, float** %ptr
+ ret void
+}
+
define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind {
;CHECK: vst1i64:
;CHECK: vst1.64
@@ -64,6 +77,19 @@ define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind {
ret void
}
+;Check that a non-constant pointer increment is folded into the 128-bit store as a register post-increment.
+define void @vst1Qi16_update(i16** %ptr, <8 x i16>* %B, i32 %inc) nounwind {
+;CHECK: vst1Qi16_update:
+;CHECK: vst1.16 {d16, d17}, [r1, :64], r2
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 8)
+ %tmp2 = getelementptr i16* %A, i32 %inc ; increment derived from %inc, not a constant
+ store i16* %tmp2, i16** %ptr
+ ret void
+}
+
define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst1Qi32:
;CHECK: vst1.32
diff --git a/test/CodeGen/ARM/vst2.ll b/test/CodeGen/ARM/vst2.ll
index ed2498b276..915a84b677 100644
--- a/test/CodeGen/ARM/vst2.ll
+++ b/test/CodeGen/ARM/vst2.ll
@@ -9,6 +9,18 @@ define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind {
ret void
}
+;Check that a non-constant pointer increment is folded into the vst2 as a register post-increment.
+define void @vst2i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
+;CHECK: vst2i8_update:
+;CHECK: vst2.8 {d16, d17}, [r1], r2
+ %A = load i8** %ptr
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4)
+ %tmp2 = getelementptr i8* %A, i32 %inc ; increment is %inc, not a constant
+ store i8* %tmp2, i8** %ptr
+ ret void
+}
+
define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst2i16:
;Check the alignment value. Max for this instruction is 128 bits:
@@ -47,6 +59,19 @@ define void @vst2i64(i64* %A, <1 x i64>* %B) nounwind {
ret void
}
+;Check for a post-increment updating store; the vst2 of <1 x i64> vectors is expected as a two-register vst1.64 with writeback.
+define void @vst2i64_update(i64** %ptr, <1 x i64>* %B) nounwind {
+;CHECK: vst2i64_update:
+;CHECK: vst1.64 {d16, d17}, [r1, :64]!
+ %A = load i64** %ptr
+ %tmp0 = bitcast i64* %A to i8*
+ %tmp1 = load <1 x i64>* %B
+ call void @llvm.arm.neon.vst2.v1i64(i8* %tmp0, <1 x i64> %tmp1, <1 x i64> %tmp1, i32 8)
+ %tmp2 = getelementptr i64* %A, i32 2 ; 2 x i64 = 16 bytes, the size of the two <1 x i64> vectors stored
+ store i64* %tmp2, i64** %ptr
+ ret void
+}
+
define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind {
;CHECK: vst2Qi8:
;Check the alignment value. Max for this instruction is 256 bits:
diff --git a/test/CodeGen/ARM/vst3.ll b/test/CodeGen/ARM/vst3.ll
index 0a2df77aa0..d262303bc6 100644
--- a/test/CodeGen/ARM/vst3.ll
+++ b/test/CodeGen/ARM/vst3.ll
@@ -28,6 +28,19 @@ define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind {
ret void
}
+;Check that the pointer increment is folded into the vst3 as a post-increment (writeback) update.
+define void @vst3i32_update(i32** %ptr, <2 x i32>* %B) nounwind {
+;CHECK: vst3i32_update:
+;CHECK: vst3.32 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <2 x i32>* %B
+ call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1)
+ %tmp2 = getelementptr i32* %A, i32 6 ; 6 x i32 = 24 bytes, the size of the three <2 x i32> vectors stored
+ store i32* %tmp2, i32** %ptr
+ ret void
+}
+
define void @vst3f(float* %A, <2 x float>* %B) nounwind {
;CHECK: vst3f:
;CHECK: vst3.32
@@ -69,6 +82,20 @@ define void @vst3Qi16(i16* %A, <8 x i16>* %B) nounwind {
ret void
}
+;Check for a post-increment updating store; the 128-bit vst3 is expected as two instructions, each with writeback.
+define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind {
+;CHECK: vst3Qi16_update:
+;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
+;CHECK: vst3.16 {d{{.*}}, d{{.*}}, d{{.*}}}, [r{{.*}}]!
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <8 x i16>* %B
+ call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1)
+ %tmp2 = getelementptr i16* %A, i32 24 ; 24 x i16 = 48 bytes, the size of the three <8 x i16> vectors stored
+ store i16* %tmp2, i16** %ptr
+ ret void
+}
+
define void @vst3Qi32(i32* %A, <4 x i32>* %B) nounwind {
;CHECK: vst3Qi32:
;CHECK: vst3.32
diff --git a/test/CodeGen/ARM/vst4.ll b/test/CodeGen/ARM/vst4.ll
index 6a7e91dee9..e94acb66bf 100644
--- a/test/CodeGen/ARM/vst4.ll
+++ b/test/CodeGen/ARM/vst4.ll
@@ -9,6 +9,18 @@ define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind {
ret void
}
+;Check that a non-constant pointer increment is folded into the vst4 as a register post-increment.
+define void @vst4i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind {
+;CHECK: vst4i8_update:
+;CHECK: vst4.8 {d16, d17, d18, d19}, [r1, :128], r2
+ %A = load i8** %ptr
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 16)
+ %tmp2 = getelementptr i8* %A, i32 %inc ; increment is %inc, not a constant
+ store i8* %tmp2, i8** %ptr
+ ret void
+}
+
define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4i16:
;Check the alignment value. Max for this instruction is 256 bits:
@@ -89,6 +101,20 @@ define void @vst4Qf(float* %A, <4 x float>* %B) nounwind {
ret void
}
+;Check for a post-increment updating store; the 128-bit vst4 is expected as two instructions, each with writeback.
+define void @vst4Qf_update(float** %ptr, <4 x float>* %B) nounwind {
+;CHECK: vst4Qf_update:
+;CHECK: vst4.32 {d16, d18, d20, d22}, [r1]!
+;CHECK: vst4.32 {d17, d19, d21, d23}, [r1]!
+ %A = load float** %ptr
+ %tmp0 = bitcast float* %A to i8*
+ %tmp1 = load <4 x float>* %B
+ call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1)
+ %tmp2 = getelementptr float* %A, i32 16 ; 16 x float = 64 bytes, the size of the four <4 x float> vectors stored
+ store float* %tmp2, float** %ptr
+ ret void
+}
+
declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind
declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind
declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind
diff --git a/test/CodeGen/ARM/vstlane.ll b/test/CodeGen/ARM/vstlane.ll
index 9aa8d59a28..6cc052bbeb 100644
--- a/test/CodeGen/ARM/vstlane.ll
+++ b/test/CodeGen/ARM/vstlane.ll
@@ -94,6 +94,19 @@ define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind {
ret void
}
+;Check that a non-constant pointer increment is folded into the lane store as a register post-increment.
+define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind {
+;CHECK: vst2lanei16_update:
+;CHECK: vst2.16 {d16[1], d17[1]}, [r1], r2
+ %A = load i16** %ptr
+ %tmp0 = bitcast i16* %A to i8*
+ %tmp1 = load <4 x i16>* %B
+ call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2)
+ %tmp2 = getelementptr i16* %A, i32 %inc ; increment derived from %inc, not a constant
+ store i16* %tmp2, i16** %ptr
+ ret void
+}
+
define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind {
;CHECK: vst2lanei32:
;CHECK: vst2.32
@@ -205,6 +218,19 @@ define void @vst3laneQi32(i32* %A, <4 x i32>* %B) nounwind {
ret void
}
+;Check that the pointer increment is folded into the lane store as a post-increment (writeback) update.
+define void @vst3laneQi32_update(i32** %ptr, <4 x i32>* %B) nounwind {
+;CHECK: vst3laneQi32_update:
+;CHECK: vst3.32 {d16[0], d18[0], d20[0]}, [r1]!
+ %A = load i32** %ptr
+ %tmp0 = bitcast i32* %A to i8*
+ %tmp1 = load <4 x i32>* %B
+ call void @llvm.arm.neon.vst3lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 0, i32 1)
+ %tmp2 = getelementptr i32* %A, i32 3 ; 3 x i32 = 12 bytes: one i32 lane for each of the 3 vectors
+ store i32* %tmp2, i32** %ptr
+ ret void
+}
+
define void @vst3laneQf(float* %A, <4 x float>* %B) nounwind {
;CHECK: vst3laneQf:
;CHECK: vst3.32
@@ -233,6 +259,18 @@ define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind {
ret void
}
+;Check that the pointer increment is folded into the lane store as a post-increment (writeback) update.
+define void @vst4lanei8_update(i8** %ptr, <8 x i8>* %B) nounwind {
+;CHECK: vst4lanei8_update:
+;CHECK: vst4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1, :32]!
+ %A = load i8** %ptr
+ %tmp1 = load <8 x i8>* %B
+ call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8)
+ %tmp2 = getelementptr i8* %A, i32 4 ; 4 bytes: one i8 lane for each of the 4 vectors
+ store i8* %tmp2, i8** %ptr
+ ret void
+}
+
define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind {
;CHECK: vst4lanei16:
;CHECK: vst4.16