summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHao Liu <Hao.Liu@arm.com>2013-11-19 02:17:05 +0000
committerHao Liu <Hao.Liu@arm.com>2013-11-19 02:17:05 +0000
commit36c7806f4eacd676932ba630246f88e0e37b1cd4 (patch)
tree2c9884d3bdad08211208fbb8e21a6ed8d423d93e
parente40e68add7f17f6ad5cd5e85ea44b149f6935147 (diff)
downloadllvm-36c7806f4eacd676932ba630246f88e0e37b1cd4.tar.gz
llvm-36c7806f4eacd676932ba630246f88e0e37b1cd4.tar.bz2
llvm-36c7806f4eacd676932ba630246f88e0e37b1cd4.tar.xz
Implement AArch64 neon instructions class SIMD lsone and SIMD lone-post.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195078 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp589
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp217
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h24
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td79
-rw-r--r--lib/Target/AArch64/AArch64InstrNEON.td654
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp20
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp429
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp2
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h17
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-one.ll2113
-rw-r--r--test/CodeGen/AArch64/neon-simd-post-ldst-one.ll319
-rw-r--r--test/MC/AArch64/neon-diagnostics.s119
-rw-r--r--test/MC/AArch64/neon-simd-ldst-one-elem.s325
-rw-r--r--test/MC/Disassembler/AArch64/neon-instructions.txt84
14 files changed, 4800 insertions, 191 deletions
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index a6ebfe31b4..ef99541c17 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -117,11 +117,11 @@ private:
SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
/// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVLD(SDNode *N, unsigned NumVecs, bool isUpdating,
+ SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *Opcode);
/// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVST(SDNode *N, unsigned NumVecs, bool isUpdating,
+ SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *Opcodes);
/// Form sequences of consecutive 64/128-bit registers for use in NEON
@@ -135,6 +135,19 @@ private:
/// functions. Those should almost always be called instead.
SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
unsigned SubRegs[]);
+
+ /// Select NEON load-duplicate intrinsics. NumVecs should be 2, 3 or 4.
+ /// The opcode array specifies the instructions used for load.
+ SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes);
+
+ /// Select NEON load/store lane intrinsics. NumVecs should be 2, 3 or 4.
+ /// The opcode arrays specify the instructions used for load/store.
+ SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
+ unsigned NumVecs, const uint16_t *Opcodes);
+
+ SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+ SDValue Operand);
};
}
@@ -590,32 +603,84 @@ static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
+
+ // Post-index of duplicate loads
+ case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
+ case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
+ case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
+ case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
+ case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
+ case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
+ case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
+ case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
+
+ case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
+ case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
+ case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
+ case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
+ case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
+ case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
+ case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
+ case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
+
+ case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
+ case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
+ case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
+ case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
+ case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
+ case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
+ case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
+ case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
+
+ // Post-index of lane loads
+ case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
+ case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
+ case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
+ case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
+
+ case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
+ case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
+ case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
+ case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
+
+ case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
+ case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
+ case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
+ case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
+
+ // Post-index of lane stores
+ case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
+ case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
+ case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
+ case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
+
+ case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
+ case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
+ case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
+ case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
+
+ case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
+ case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
+ case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
+ case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
}
return Opc; // If not one we handle, return it unchanged.
}
-SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
- bool isUpdating,
+SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
const uint16_t *Opcodes) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
EVT VT = N->getValueType(0);
unsigned OpcodeIndex;
- switch (VT.getSimpleVT().SimpleTy) {
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
default: llvm_unreachable("unhandled vector load type");
- case MVT::v8i8: OpcodeIndex = 0; break;
- case MVT::v4i16: OpcodeIndex = 1; break;
- case MVT::v2f32:
- case MVT::v2i32: OpcodeIndex = 2; break;
- case MVT::v1f64:
- case MVT::v1i64: OpcodeIndex = 3; break;
- case MVT::v16i8: OpcodeIndex = 4; break;
- case MVT::v8f16:
- case MVT::v8i16: OpcodeIndex = 5; break;
- case MVT::v4f32:
- case MVT::v4i32: OpcodeIndex = 6; break;
- case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 7; break;
}
unsigned Opc = Opcodes[OpcodeIndex];
@@ -632,9 +697,8 @@ SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
Ops.push_back(N->getOperand(0)); // Push back the Chain
- std::vector<EVT> ResTys;
- bool is64BitVector = VT.is64BitVector();
-
+ SmallVector<EVT, 3> ResTys;
+ // Push back the type of return super register
if (NumVecs == 1)
ResTys.push_back(VT);
else if (NumVecs == 3)
@@ -675,8 +739,8 @@ SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
return NULL;
}
-SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
- bool isUpdating,
+SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
const uint16_t *Opcodes) {
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
SDLoc dl(N);
@@ -685,28 +749,20 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
unsigned AddrOpIdx = isUpdating ? 1 : 2;
- unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
+ unsigned Vec0Idx = 3;
EVT VT = N->getOperand(Vec0Idx).getValueType();
unsigned OpcodeIndex;
- switch (VT.getSimpleVT().SimpleTy) {
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
default: llvm_unreachable("unhandled vector store type");
- case MVT::v8i8: OpcodeIndex = 0; break;
- case MVT::v4i16: OpcodeIndex = 1; break;
- case MVT::v2f32:
- case MVT::v2i32: OpcodeIndex = 2; break;
- case MVT::v1f64:
- case MVT::v1i64: OpcodeIndex = 3; break;
- case MVT::v16i8: OpcodeIndex = 4; break;
- case MVT::v8f16:
- case MVT::v8i16: OpcodeIndex = 5; break;
- case MVT::v4f32:
- case MVT::v4i32: OpcodeIndex = 6; break;
- case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 7; break;
}
unsigned Opc = Opcodes[OpcodeIndex];
- std::vector<EVT> ResTys;
+ SmallVector<EVT, 2> ResTys;
if (isUpdating)
ResTys.push_back(MVT::i64);
ResTys.push_back(MVT::Other); // Type for the Chain
@@ -720,7 +776,6 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
Opc = getVLDSTRegisterUpdateOpcode(Opc);
Ops.push_back(Inc);
}
- bool is64BitVector = VT.is64BitVector();
SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
N->op_begin() + Vec0Idx + NumVecs);
@@ -737,6 +792,172 @@ SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
return VSt;
}
+SDValue
+AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
+ SDValue Operand) {
+ SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
+ VT, VTD, MVT::Other,
+ CurDAG->getTargetConstant(0, MVT::i64),
+ Operand,
+ CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
+ return SDValue(Reg, 0);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
+ unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+ SDLoc dl(N);
+
+ EVT VT = N->getValueType(0);
+ unsigned OpcodeIndex;
+ bool is64BitVector = VT.is64BitVector();
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
+ case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
+ case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
+ case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
+ default: llvm_unreachable("unhandled vector duplicate lane load type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SDValue SuperReg;
+ SmallVector<SDValue, 6> Ops;
+ Ops.push_back(N->getOperand(1)); // Push back the Memory Address
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(2);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+ Ops.push_back(N->getOperand(0)); // Push back the Chain
+
+ SmallVector<EVT, 3> ResTys;
+ // Push back the type of return super register
+ if (NumVecs == 3)
+ ResTys.push_back(MVT::Untyped);
+ else {
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+ is64BitVector ? NumVecs : NumVecs * 2);
+ ResTys.push_back(ResTy);
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i64); // Type of the updated register
+ ResTys.push_back(MVT::Other); // Type of the Chain
+ SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+ // Transfer memoperands.
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
+
+ SuperReg = SDValue(VLdDup, 0);
+ unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
+ // Update uses of each registers in super register
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+ ReplaceUses(SDValue(N, Vec),
+ CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
+ // Update uses of the Chain
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
+ return NULL;
+}
+
+// We only have 128-bit vector type of load/store lane instructions.
+// If it is 64-bit vector, we also select it to the 128-bit instructions.
+// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and
+// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output.
+SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+ bool isUpdating, unsigned NumVecs,
+ const uint16_t *Opcodes) {
+ assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+ SDLoc dl(N);
+ unsigned AddrOpIdx = isUpdating ? 1 : 2;
+ unsigned Vec0Idx = 3;
+
+ SDValue Chain = N->getOperand(0);
+ unsigned Lane =
+ cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
+ EVT VT = N->getOperand(Vec0Idx).getValueType();
+ bool is64BitVector = VT.is64BitVector();
+ EVT VT64; // 64-bit Vector Type
+
+ if (is64BitVector) {
+ VT64 = VT;
+ VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
+ VT.getVectorNumElements() * 2);
+ }
+
+ unsigned OpcodeIndex;
+ switch (VT.getScalarType().getSizeInBits()) {
+ case 8: OpcodeIndex = 0; break;
+ case 16: OpcodeIndex = 1; break;
+ case 32: OpcodeIndex = 2; break;
+ case 64: OpcodeIndex = 3; break;
+ default: llvm_unreachable("unhandled vector lane load/store type");
+ }
+ unsigned Opc = Opcodes[OpcodeIndex];
+
+ SmallVector<EVT, 3> ResTys;
+ if (IsLoad) {
+ // Push back the type of return super register
+ if (NumVecs == 3)
+ ResTys.push_back(MVT::Untyped);
+ else {
+ EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
+ is64BitVector ? NumVecs : NumVecs * 2);
+ ResTys.push_back(ResTy);
+ }
+ }
+ if (isUpdating)
+ ResTys.push_back(MVT::i64); // Type of the updated register
+ ResTys.push_back(MVT::Other); // Type of Chain
+ SmallVector<SDValue, 5> Ops;
+ Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+ if (isUpdating) {
+ SDValue Inc = N->getOperand(AddrOpIdx + 1);
+ if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
+ Opc = getVLDSTRegisterUpdateOpcode(Opc);
+ Ops.push_back(Inc);
+ }
+
+ SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
+ N->op_begin() + Vec0Idx + NumVecs);
+ if (is64BitVector)
+ for (unsigned i = 0; i < Regs.size(); i++)
+ Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
+ SDValue SuperReg = createQTuple(Regs);
+
+ Ops.push_back(SuperReg); // Source Reg
+ SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
+ Ops.push_back(LaneValue);
+ Ops.push_back(Chain); // Push back the Chain
+
+ SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+ cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
+ if (!IsLoad)
+ return VLdLn;
+
+ // Extract the subregisters.
+ SuperReg = SDValue(VLdLn, 0);
+ unsigned Sub0 = AArch64::qsub_0;
+ // Update uses of each registers in super register
+ for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+ SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
+ if (is64BitVector) {
+ SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
+ }
+ ReplaceUses(SDValue(N, Vec), SUB0);
+ }
+ ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
+ if (isUpdating)
+ ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
+ return NULL;
+}
+
unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
unsigned NumOfVec) {
assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
@@ -955,7 +1176,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
AArch64::LD1WB_4S_fixed, AArch64::LD1WB_2D_fixed
};
- return SelectVLD(Node, 1, true, Opcodes);
+ return SelectVLD(Node, true, 1, Opcodes);
}
case AArch64ISD::NEON_LD2_UPD: {
static const uint16_t Opcodes[] = {
@@ -964,7 +1185,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
AArch64::LD2WB_4S_fixed, AArch64::LD2WB_2D_fixed
};
- return SelectVLD(Node, 2, true, Opcodes);
+ return SelectVLD(Node, true, 2, Opcodes);
}
case AArch64ISD::NEON_LD3_UPD: {
static const uint16_t Opcodes[] = {
@@ -973,7 +1194,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
AArch64::LD3WB_4S_fixed, AArch64::LD3WB_2D_fixed
};
- return SelectVLD(Node, 3, true, Opcodes);
+ return SelectVLD(Node, true, 3, Opcodes);
}
case AArch64ISD::NEON_LD4_UPD: {
static const uint16_t Opcodes[] = {
@@ -982,7 +1203,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
AArch64::LD4WB_4S_fixed, AArch64::LD4WB_2D_fixed
};
- return SelectVLD(Node, 4, true, Opcodes);
+ return SelectVLD(Node, true, 4, Opcodes);
}
case AArch64ISD::NEON_LD1x2_UPD: {
static const uint16_t Opcodes[] = {
@@ -991,7 +1212,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
AArch64::LD1x2WB_4S_fixed, AArch64::LD1x2WB_2D_fixed
};
- return SelectVLD(Node, 2, true, Opcodes);
+ return SelectVLD(Node, true, 2, Opcodes);
}
case AArch64ISD::NEON_LD1x3_UPD: {
static const uint16_t Opcodes[] = {
@@ -1000,7 +1221,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
AArch64::LD1x3WB_4S_fixed, AArch64::LD1x3WB_2D_fixed
};
- return SelectVLD(Node, 3, true, Opcodes);
+ return SelectVLD(Node, true, 3, Opcodes);
}
case AArch64ISD::NEON_LD1x4_UPD: {
static const uint16_t Opcodes[] = {
@@ -1009,7 +1230,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
AArch64::LD1x4WB_4S_fixed, AArch64::LD1x4WB_2D_fixed
};
- return SelectVLD(Node, 4, true, Opcodes);
+ return SelectVLD(Node, true, 4, Opcodes);
}
case AArch64ISD::NEON_ST1_UPD: {
static const uint16_t Opcodes[] = {
@@ -1018,7 +1239,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
AArch64::ST1WB_4S_fixed, AArch64::ST1WB_2D_fixed
};
- return SelectVST(Node, 1, true, Opcodes);
+ return SelectVST(Node, true, 1, Opcodes);
}
case AArch64ISD::NEON_ST2_UPD: {
static const uint16_t Opcodes[] = {
@@ -1027,7 +1248,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
AArch64::ST2WB_4S_fixed, AArch64::ST2WB_2D_fixed
};
- return SelectVST(Node, 2, true, Opcodes);
+ return SelectVST(Node, true, 2, Opcodes);
}
case AArch64ISD::NEON_ST3_UPD: {
static const uint16_t Opcodes[] = {
@@ -1036,7 +1257,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
AArch64::ST3WB_4S_fixed, AArch64::ST3WB_2D_fixed
};
- return SelectVST(Node, 3, true, Opcodes);
+ return SelectVST(Node, true, 3, Opcodes);
}
case AArch64ISD::NEON_ST4_UPD: {
static const uint16_t Opcodes[] = {
@@ -1045,7 +1266,100 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
AArch64::ST4WB_4S_fixed, AArch64::ST4WB_2D_fixed
};
- return SelectVST(Node, 4, true, Opcodes);
+ return SelectVST(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
+ AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
+ AArch64::LD2R_4S, AArch64::LD2R_2D
+ };
+ return SelectVLDDup(Node, false, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
+ AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
+ AArch64::LD3R_4S, AArch64::LD3R_2D
+ };
+ return SelectVLDDup(Node, false, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4DUP: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
+ AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
+ AArch64::LD4R_4S, AArch64::LD4R_2D
+ };
+ return SelectVLDDup(Node, false, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2R_WB_8B_fixed, AArch64::LD2R_WB_4H_fixed,
+ AArch64::LD2R_WB_2S_fixed, AArch64::LD2R_WB_1D_fixed,
+ AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
+ AArch64::LD2R_WB_4S_fixed, AArch64::LD2R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3R_WB_8B_fixed, AArch64::LD3R_WB_4H_fixed,
+ AArch64::LD3R_WB_2S_fixed, AArch64::LD3R_WB_1D_fixed,
+ AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
+ AArch64::LD3R_WB_4S_fixed, AArch64::LD3R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4DUP_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4R_WB_8B_fixed, AArch64::LD4R_WB_4H_fixed,
+ AArch64::LD4R_WB_2S_fixed, AArch64::LD4R_WB_1D_fixed,
+ AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
+ AArch64::LD4R_WB_4S_fixed, AArch64::LD4R_WB_2D_fixed
+ };
+ return SelectVLDDup(Node, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_LD2LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
+ AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_LD3LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
+ AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_LD4LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
+ AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, true, true, 4, Opcodes);
+ }
+ case AArch64ISD::NEON_ST2LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
+ AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 2, Opcodes);
+ }
+ case AArch64ISD::NEON_ST3LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
+ AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 3, Opcodes);
+ }
+ case AArch64ISD::NEON_ST4LN_UPD: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
+ AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
+ };
+ return SelectVLDSTLane(Node, false, true, 4, Opcodes);
}
case AArch64ISD::NEON_ST1x2_UPD: {
static const uint16_t Opcodes[] = {
@@ -1054,7 +1368,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
AArch64::ST1x2WB_4S_fixed, AArch64::ST1x2WB_2D_fixed
};
- return SelectVST(Node, 2, true, Opcodes);
+ return SelectVST(Node, true, 2, Opcodes);
}
case AArch64ISD::NEON_ST1x3_UPD: {
static const uint16_t Opcodes[] = {
@@ -1063,7 +1377,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
AArch64::ST1x3WB_4S_fixed, AArch64::ST1x3WB_2D_fixed
};
- return SelectVST(Node, 3, true, Opcodes);
+ return SelectVST(Node, true, 3, Opcodes);
}
case AArch64ISD::NEON_ST1x4_UPD: {
static const uint16_t Opcodes[] = {
@@ -1072,7 +1386,7 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
AArch64::ST1x4WB_4S_fixed, AArch64::ST1x4WB_2D_fixed
};
- return SelectVST(Node, 4, true, Opcodes);
+ return SelectVST(Node, true, 4, Opcodes);
}
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
@@ -1105,114 +1419,149 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
switch (IntNo) {
default:
break;
-
case Intrinsic::arm_neon_vld1: {
- static const uint16_t Opcodes[] = { AArch64::LD1_8B, AArch64::LD1_4H,
- AArch64::LD1_2S, AArch64::LD1_1D,
- AArch64::LD1_16B, AArch64::LD1_8H,
- AArch64::LD1_4S, AArch64::LD1_2D };
- return SelectVLD(Node, 1, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::LD1_8B, AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
+ AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
+ };
+ return SelectVLD(Node, false, 1, Opcodes);
}
case Intrinsic::arm_neon_vld2: {
- static const uint16_t Opcodes[] = { AArch64::LD2_8B, AArch64::LD2_4H,
- AArch64::LD2_2S, AArch64::LD1x2_1D,
- AArch64::LD2_16B, AArch64::LD2_8H,
- AArch64::LD2_4S, AArch64::LD2_2D };
- return SelectVLD(Node, 2, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2_8B, AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
+ AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
+ };
+ return SelectVLD(Node, false, 2, Opcodes);
}
case Intrinsic::arm_neon_vld3: {
- static const uint16_t Opcodes[] = { AArch64::LD3_8B, AArch64::LD3_4H,
- AArch64::LD3_2S, AArch64::LD1x3_1D,
- AArch64::LD3_16B, AArch64::LD3_8H,
- AArch64::LD3_4S, AArch64::LD3_2D };
- return SelectVLD(Node, 3, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3_8B, AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
+ AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
+ };
+ return SelectVLD(Node, false, 3, Opcodes);
}
case Intrinsic::arm_neon_vld4: {
- static const uint16_t Opcodes[] = { AArch64::LD4_8B, AArch64::LD4_4H,
- AArch64::LD4_2S, AArch64::LD1x4_1D,
- AArch64::LD4_16B, AArch64::LD4_8H,
- AArch64::LD4_4S, AArch64::LD4_2D };
- return SelectVLD(Node, 4, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4_8B, AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
+ AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
+ };
+ return SelectVLD(Node, false, 4, Opcodes);
}
case Intrinsic::aarch64_neon_vld1x2: {
static const uint16_t Opcodes[] = {
- AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S,
- AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
- AArch64::LD1x2_4S, AArch64::LD1x2_2D
+ AArch64::LD1x2_8B, AArch64::LD1x2_4H, AArch64::LD1x2_2S,
+ AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
+ AArch64::LD1x2_4S, AArch64::LD1x2_2D
};
- return SelectVLD(Node, 2, false, Opcodes);
+ return SelectVLD(Node, false, 2, Opcodes);
}
case Intrinsic::aarch64_neon_vld1x3: {
static const uint16_t Opcodes[] = {
- AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S,
- AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
- AArch64::LD1x3_4S, AArch64::LD1x3_2D
+ AArch64::LD1x3_8B, AArch64::LD1x3_4H, AArch64::LD1x3_2S,
+ AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
+ AArch64::LD1x3_4S, AArch64::LD1x3_2D
};
- return SelectVLD(Node, 3, false, Opcodes);
+ return SelectVLD(Node, false, 3, Opcodes);
}
case Intrinsic::aarch64_neon_vld1x4: {
static const uint16_t Opcodes[] = {
- AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S,
- AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
- AArch64::LD1x4_4S, AArch64::LD1x4_2D
+ AArch64::LD1x4_8B, AArch64::LD1x4_4H, AArch64::LD1x4_2S,
+ AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
+ AArch64::LD1x4_4S, AArch64::LD1x4_2D
};
- return SelectVLD(Node, 4, false, Opcodes);
+ return SelectVLD(Node, false, 4, Opcodes);
}
case Intrinsic::arm_neon_vst1: {
- static const uint16_t Opcodes[] = { AArch64::ST1_8B, AArch64::ST1_4H,
- AArch64::ST1_2S, AArch64::ST1_1D,
- AArch64::ST1_16B, AArch64::ST1_8H,
- AArch64::ST1_4S, AArch64::ST1_2D };
- return SelectVST(Node, 1, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::ST1_8B, AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
+ AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
+ };
+ return SelectVST(Node, false, 1, Opcodes);
}
case Intrinsic::arm_neon_vst2: {
- static const uint16_t Opcodes[] = { AArch64::ST2_8B, AArch64::ST2_4H,
- AArch64::ST2_2S, AArch64::ST1x2_1D,
- AArch64::ST2_16B, AArch64::ST2_8H,
- AArch64::ST2_4S, AArch64::ST2_2D };
- return SelectVST(Node, 2, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2_8B, AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
+ AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
+ };
+ return SelectVST(Node, false, 2, Opcodes);
}
case Intrinsic::arm_neon_vst3: {
- static const uint16_t Opcodes[] = { AArch64::ST3_8B, AArch64::ST3_4H,
- AArch64::ST3_2S, AArch64::ST1x3_1D,
- AArch64::ST3_16B, AArch64::ST3_8H,
- AArch64::ST3_4S, AArch64::ST3_2D };
- return SelectVST(Node, 3, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3_8B, AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
+ AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
+ };
+ return SelectVST(Node, false, 3, Opcodes);
}
case Intrinsic::arm_neon_vst4: {
- static const uint16_t Opcodes[] = { AArch64::ST4_8B, AArch64::ST4_4H,
- AArch64::ST4_2S, AArch64::ST1x4_1D,
- AArch64::ST4_16B, AArch64::ST4_8H,
- AArch64::ST4_4S, AArch64::ST4_2D };
- return SelectVST(Node, 4, false, Opcodes);
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4_8B, AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
+ AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
+ };
+ return SelectVST(Node, false, 4, Opcodes);
}
case Intrinsic::aarch64_neon_vst1x2: {
static const uint16_t Opcodes[] = {
- AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S,
- AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
- AArch64::ST1x2_4S, AArch64::ST1x2_2D
+ AArch64::ST1x2_8B, AArch64::ST1x2_4H, AArch64::ST1x2_2S,
+ AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
+ AArch64::ST1x2_4S, AArch64::ST1x2_2D
};
- return SelectVST(Node, 2, false, Opcodes);
+ return SelectVST(Node, false, 2, Opcodes);
}
case Intrinsic::aarch64_neon_vst1x3: {
static const uint16_t Opcodes[] = {
- AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S,
- AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
- AArch64::ST1x3_4S, AArch64::ST1x3_2D
+ AArch64::ST1x3_8B, AArch64::ST1x3_4H, AArch64::ST1x3_2S,
+ AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
+ AArch64::ST1x3_4S, AArch64::ST1x3_2D
};
- return SelectVST(Node, 3, false, Opcodes);
+ return SelectVST(Node, false, 3, Opcodes);
}
case Intrinsic::aarch64_neon_vst1x4: {
static const uint16_t Opcodes[] = {
- AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S,
- AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
- AArch64::ST1x4_4S, AArch64::ST1x4_2D
+ AArch64::ST1x4_8B, AArch64::ST1x4_4H, AArch64::ST1x4_2S,
+ AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
+ AArch64::ST1x4_4S, AArch64::ST1x4_2D
+ };
+ return SelectVST(Node, false, 4, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld2lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
+ };
+ return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vld3lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
};
- return SelectVST(Node, 4, false, Opcodes);
+ return SelectVLDSTLane(Node, true, false, 3, Opcodes);
}
+ case Intrinsic::arm_neon_vld4lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
+ };
+ return SelectVLDSTLane(Node, true, false, 4, Opcodes);
}
+ case Intrinsic::arm_neon_vst2lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst3lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+ }
+ case Intrinsic::arm_neon_vst4lane: {
+ static const uint16_t Opcodes[] = {
+ AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
+ };
+ return SelectVLDSTLane(Node, false, false, 4, Opcodes);
+ }
+ } // End of switch IntNo
break;
- }
+ } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
default:
break; // Let generic code handle it
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index bf04bf3747..003359d1b5 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -949,6 +949,30 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
return "AArch64ISD::NEON_ST1x3_UPD";
case AArch64ISD::NEON_ST1x4_UPD:
return "AArch64ISD::NEON_ST1x4_UPD";
+ case AArch64ISD::NEON_LD2DUP:
+ return "AArch64ISD::NEON_LD2DUP";
+ case AArch64ISD::NEON_LD3DUP:
+ return "AArch64ISD::NEON_LD3DUP";
+ case AArch64ISD::NEON_LD4DUP:
+ return "AArch64ISD::NEON_LD4DUP";
+ case AArch64ISD::NEON_LD2DUP_UPD:
+ return "AArch64ISD::NEON_LD2DUP_UPD";
+ case AArch64ISD::NEON_LD3DUP_UPD:
+ return "AArch64ISD::NEON_LD3DUP_UPD";
+ case AArch64ISD::NEON_LD4DUP_UPD:
+ return "AArch64ISD::NEON_LD4DUP_UPD";
+ case AArch64ISD::NEON_LD2LN_UPD:
+ return "AArch64ISD::NEON_LD2LN_UPD";
+ case AArch64ISD::NEON_LD3LN_UPD:
+ return "AArch64ISD::NEON_LD3LN_UPD";
+ case AArch64ISD::NEON_LD4LN_UPD:
+ return "AArch64ISD::NEON_LD4LN_UPD";
+ case AArch64ISD::NEON_ST2LN_UPD:
+ return "AArch64ISD::NEON_ST2LN_UPD";
+ case AArch64ISD::NEON_ST3LN_UPD:
+ return "AArch64ISD::NEON_ST3LN_UPD";
+ case AArch64ISD::NEON_ST4LN_UPD:
+ return "AArch64ISD::NEON_ST4LN_UPD";
case AArch64ISD::NEON_VEXTRACT:
return "AArch64ISD::NEON_VEXTRACT";
default:
@@ -3518,7 +3542,9 @@ static SDValue CombineBaseUpdate(SDNode *N,
return SDValue();
SelectionDAG &DAG = DCI.DAG;
- unsigned AddrOpIdx = 2;
+ bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
+ N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+ unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
SDValue Addr = N->getOperand(AddrOpIdx);
// Search for a use of the address operand that is an increment.
@@ -3536,39 +3562,65 @@ static SDValue CombineBaseUpdate(SDNode *N,
// Find the new opcode for the updating load/store.
bool isLoad = true;
+ bool isLaneOp = false;
unsigned NewOpc = 0;
unsigned NumVecs = 0;
- unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: llvm_unreachable("unexpected intrinsic for Neon base update");
- case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD;
- NumVecs = 1; break;
- case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD;
- NumVecs = 2; break;
- case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD;
- NumVecs = 3; break;
- case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD;
- NumVecs = 4; break;
- case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD;
- NumVecs = 1; isLoad = false; break;
- case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD;
- NumVecs = 2; isLoad = false; break;
- case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD;
- NumVecs = 3; isLoad = false; break;
- case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD;
- NumVecs = 4; isLoad = false; break;
- case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
- NumVecs = 2; break;
- case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
- NumVecs = 3; break;
- case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
- NumVecs = 4; break;
- case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
- NumVecs = 2; isLoad = false; break;
- case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
- NumVecs = 3; isLoad = false; break;
- case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
- NumVecs = 4; isLoad = false; break;
+ if (isIntrinsic) {
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (IntNo) {
+ default: llvm_unreachable("unexpected intrinsic for Neon base update");
+ case Intrinsic::arm_neon_vld1: NewOpc = AArch64ISD::NEON_LD1_UPD;
+ NumVecs = 1; break;
+ case Intrinsic::arm_neon_vld2: NewOpc = AArch64ISD::NEON_LD2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::arm_neon_vld3: NewOpc = AArch64ISD::NEON_LD3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::arm_neon_vld4: NewOpc = AArch64ISD::NEON_LD4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::arm_neon_vst1: NewOpc = AArch64ISD::NEON_ST1_UPD;
+ NumVecs = 1; isLoad = false; break;
+ case Intrinsic::arm_neon_vst2: NewOpc = AArch64ISD::NEON_ST2_UPD;
+ NumVecs = 2; isLoad = false; break;
+ case Intrinsic::arm_neon_vst3: NewOpc = AArch64ISD::NEON_ST3_UPD;
+ NumVecs = 3; isLoad = false; break;
+ case Intrinsic::arm_neon_vst4: NewOpc = AArch64ISD::NEON_ST4_UPD;
+ NumVecs = 4; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
+ NumVecs = 2; break;
+ case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
+ NumVecs = 3; break;
+ case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
+ NumVecs = 4; break;
+ case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
+ NumVecs = 2; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
+ NumVecs = 3; isLoad = false; break;
+ case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
+ NumVecs = 4; isLoad = false; break;
+ case Intrinsic::arm_neon_vld2lane: NewOpc = AArch64ISD::NEON_LD2LN_UPD;
+ NumVecs = 2; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld3lane: NewOpc = AArch64ISD::NEON_LD3LN_UPD;
+ NumVecs = 3; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vld4lane: NewOpc = AArch64ISD::NEON_LD4LN_UPD;
+ NumVecs = 4; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst2lane: NewOpc = AArch64ISD::NEON_ST2LN_UPD;
+ NumVecs = 2; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst3lane: NewOpc = AArch64ISD::NEON_ST3LN_UPD;
+ NumVecs = 3; isLoad = false; isLaneOp = true; break;
+ case Intrinsic::arm_neon_vst4lane: NewOpc = AArch64ISD::NEON_ST4LN_UPD;
+ NumVecs = 4; isLoad = false; isLaneOp = true; break;
+ }
+ } else {
+ isLaneOp = true;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("unexpected opcode for Neon base update");
+ case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
+ NumVecs = 2; break;
+ case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
+ NumVecs = 3; break;
+ case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
+ NumVecs = 4; break;
+ }
}
// Find the size of memory referenced by the load/store.
@@ -3578,6 +3630,8 @@ static SDValue CombineBaseUpdate(SDNode *N,
else
VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+ if (isLaneOp)
+ NumBytes /= VecTy.getVectorNumElements();
// If the increment is a constant, it must match the memory ref size.
SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
@@ -3624,6 +3678,83 @@ static SDValue CombineBaseUpdate(SDNode *N,
return SDValue();
}
+/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
+/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
+/// If so, combine them to a vldN-dup operation and return true.
+static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ // Check if the VDUPLANE operand is a vldN-dup intrinsic.
+ SDNode *VLD = N->getOperand(0).getNode();
+ if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+ return SDValue();
+ unsigned NumVecs = 0;
+ unsigned NewOpc = 0;
+ unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::arm_neon_vld2lane) {
+ NumVecs = 2;
+ NewOpc = AArch64ISD::NEON_LD2DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
+ NumVecs = 3;
+ NewOpc = AArch64ISD::NEON_LD3DUP;
+ } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
+ NumVecs = 4;
+ NewOpc = AArch64ISD::NEON_LD4DUP;
+ } else {
+ return SDValue();
+ }
+
+ // First check that all the vldN-lane uses are VDUPLANEs and that the lane
+ // numbers match the load.
+ unsigned VLDLaneNo =
+ cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ // Ignore uses of the chain result.
+ if (UI.getUse().getResNo() == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
+ VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
+ return SDValue();
+ }
+
+ // Create the vldN-dup node.
+ EVT Tys[5];
+ unsigned n;
+ for (n = 0; n < NumVecs; ++n)
+ Tys[n] = VT;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
+ SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
+ MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
+ SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
+ VLDMemInt->getMemoryVT(),
+ VLDMemInt->getMemOperand());
+
+ // Update the uses.
+ for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
+ UI != UE; ++UI) {
+ unsigned ResNo = UI.getUse().getResNo();
+ // Ignore uses of the chain result.
+ if (ResNo == NumVecs)
+ continue;
+ SDNode *User = *UI;
+ DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
+ }
+
+ // Now the vldN-lane intrinsic is dead except for its chain result.
+ // Update uses of the chain.
+ std::vector<SDValue> VLDDupResults;
+ for (unsigned n = 0; n < NumVecs; ++n)
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
+ VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
+ DCI.CombineTo(VLD, VLDDupResults);
+
+ return SDValue(N, 0);
+}
+
SDValue
AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -3637,6 +3768,12 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return PerformShiftCombine(N, DCI, getSubtarget());
case ISD::INTRINSIC_WO_CHAIN:
return PerformIntrinsicCombine(N, DCI.DAG);
+ case AArch64ISD::NEON_VDUPLANE:
+ return CombineVLDDUP(N, DCI);
+ case AArch64ISD::NEON_LD2DUP:
+ case AArch64ISD::NEON_LD3DUP:
+ case AArch64ISD::NEON_LD4DUP:
+ return CombineBaseUpdate(N, DCI);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -3648,12 +3785,18 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::arm_neon_vst2:
case Intrinsic::arm_neon_vst3:
case Intrinsic::arm_neon_vst4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane:
case Intrinsic::aarch64_neon_vld1x2:
case Intrinsic::aarch64_neon_vld1x3:
case Intrinsic::aarch64_neon_vld1x4:
case Intrinsic::aarch64_neon_vst1x2:
case Intrinsic::aarch64_neon_vst1x3:
case Intrinsic::aarch64_neon_vst1x4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane:
return CombineBaseUpdate(N, DCI);
default:
break;
@@ -4203,7 +4346,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vld4:
case Intrinsic::aarch64_neon_vld1x2:
case Intrinsic::aarch64_neon_vld1x3:
- case Intrinsic::aarch64_neon_vld1x4: {
+ case Intrinsic::aarch64_neon_vld1x4:
+ case Intrinsic::arm_neon_vld2lane:
+ case Intrinsic::arm_neon_vld3lane:
+ case Intrinsic::arm_neon_vld4lane: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
// Conservatively set memVT to the entire set of vectors loaded.
uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
@@ -4223,7 +4369,10 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::arm_neon_vst4:
case Intrinsic::aarch64_neon_vst1x2:
case Intrinsic::aarch64_neon_vst1x3:
- case Intrinsic::aarch64_neon_vst1x4: {
+ case Intrinsic::aarch64_neon_vst1x4:
+ case Intrinsic::arm_neon_vst2lane:
+ case Intrinsic::arm_neon_vst3lane:
+ case Intrinsic::arm_neon_vst4lane: {
Info.opc = ISD::INTRINSIC_VOID;
// Conservatively set memVT to the entire set of vectors stored.
unsigned NumElts = 0;
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 0f30a7a9d2..a51d10f01c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -152,8 +152,13 @@ namespace AArch64ISD {
// Vector extract
NEON_VEXTRACT,
+ // NEON duplicate lane loads
+ NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ NEON_LD3DUP,
+ NEON_LD4DUP,
+
// NEON loads with post-increment base updates:
- NEON_LD1_UPD = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ NEON_LD1_UPD,
NEON_LD2_UPD,
NEON_LD3_UPD,
NEON_LD4_UPD,
@@ -168,7 +173,22 @@ namespace AArch64ISD {
NEON_ST4_UPD,
NEON_ST1x2_UPD,
NEON_ST1x3_UPD,
- NEON_ST1x4_UPD
+ NEON_ST1x4_UPD,
+
+ // NEON duplicate lane loads with post-increment base updates:
+ NEON_LD2DUP_UPD,
+ NEON_LD3DUP_UPD,
+ NEON_LD4DUP_UPD,
+
+ // NEON lane loads with post-increment base updates:
+ NEON_LD2LN_UPD,
+ NEON_LD3LN_UPD,
+ NEON_LD4LN_UPD,
+
+ // NEON lane store with post-increment base updates:
+ NEON_ST2LN_UPD,
+ NEON_ST3LN_UPD,
+ NEON_ST4LN_UPD
};
}
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 2a0cca8618..34f917caab 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1297,6 +1297,85 @@ class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
// Inherit Rt in 4-0
}
+// Format AdvSIMD vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+ dag ins, string asmstr, list<dag> patterns,
+ InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011010;
+ let Inst{22} = 0b1;
+ let Inst{21} = r;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-13} = opcode;
+ let Inst{12} = 0b0;
+ let Inst{11-10} = size;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD vector load/store Single N-element structure to/from one lane
+class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+ dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtn<outs, ins, asmstr, patterns, itin>
+{
+ bits<4> lane;
+ let Inst{31} = 0b0;
+ let Inst{29-23} = 0b0011010;
+ let Inst{22} = l;
+ let Inst{21} = r;
+ let Inst{20-16} = 0b00000;
+ let Inst{15-14} = op2_1;
+ let Inst{13} = op0;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load Single N-element structure to all lanes
+class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
+ dag ins, string asmstr, list<dag> patterns,
+ InstrItinClass itin>
+ : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29-23} = 0b0011011;
+ let Inst{22} = 0b1;
+ let Inst{21} = r;
+ // Inherit Rm in 20-16
+ let Inst{15-13} = opcode;
+ let Inst{12} = 0b0;
+ let Inst{11-10} = size;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
+// Format AdvSIMD post-index vector load/store Single N-element structure
+// to/from one lane
+class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
+ dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRtnm<outs, ins, asmstr, patterns, itin>
+{
+ bits<4> lane;
+ let Inst{31} = 0b0;
+ let Inst{29-23} = 0b0011011;
+ let Inst{22} = l;
+ let Inst{21} = r;
+ // Inherit Rm in 20-16
+ let Inst{15-14} = op2_1;
+ let Inst{13} = op0;
+
+ // Inherit Rn in 9-5
+ // Inherit Rt in 4-0
+}
+
// Format AdvSIMD 3 scalar registers with different type
class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index b6fa6fa893..bcd59bd2e7 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -3456,6 +3456,51 @@ def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
// The followings are post-index vector load/store multiple N-element
// structure(class SIMD lselem-post)
+def exact1_asmoperand : AsmOperandClass {
+ let Name = "Exact1";
+ let PredicateMethod = "isExactImm<1>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
+ let ParserMatchClass = exact1_asmoperand;
+}
+
+def exact2_asmoperand : AsmOperandClass {
+ let Name = "Exact2";
+ let PredicateMethod = "isExactImm<2>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
+ let ParserMatchClass = exact2_asmoperand;
+}
+
+def exact3_asmoperand : AsmOperandClass {
+ let Name = "Exact3";
+ let PredicateMethod = "isExactImm<3>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
+ let ParserMatchClass = exact3_asmoperand;
+}
+
+def exact4_asmoperand : AsmOperandClass {
+ let Name = "Exact4";
+ let PredicateMethod = "isExactImm<4>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
+ let ParserMatchClass = exact4_asmoperand;
+}
+
+def exact6_asmoperand : AsmOperandClass {
+ let Name = "Exact6";
+ let PredicateMethod = "isExactImm<6>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
+ let ParserMatchClass = exact6_asmoperand;
+}
+
def exact8_asmoperand : AsmOperandClass {
let Name = "Exact8";
let PredicateMethod = "isExactImm<8>";
@@ -3465,6 +3510,15 @@ def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
let ParserMatchClass = exact8_asmoperand;
}
+def exact12_asmoperand : AsmOperandClass {
+ let Name = "Exact12";
+ let PredicateMethod = "isExactImm<12>";
+ let RenderMethod = "addImmOperands";
+}
+def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
+ let ParserMatchClass = exact12_asmoperand;
+}
+
def exact16_asmoperand : AsmOperandClass {
let Name = "Exact16";
let PredicateMethod = "isExactImm<16>";
@@ -3678,6 +3732,574 @@ defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
// End of post-index vector load/store multiple N-element structure
// (class SIMD lselem-post)
+// The followings are vector load/store single N-element structure
+// (class SIMD lsone).
+def neon_uimm0_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm == 0;}]> {
+ let ParserMatchClass = neon_uimm0_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm1_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 2;}]> {
+ let ParserMatchClass = neon_uimm1_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm2_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 4;}]> {
+ let ParserMatchClass = neon_uimm2_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm3_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 8;}]> {
+ let ParserMatchClass = uimm3_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+def neon_uimm4_bare : Operand<i64>,
+ ImmLeaf<i64, [{return Imm < 16;}]> {
+ let ParserMatchClass = uimm4_asmoperand;
+ let PrintMethod = "printUImmBareOperand";
+}
+
+class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+ RegisterOperand VecList, string asmop>
+ : NeonI_LdOne_Dup<q, r, opcode, size,
+ (outs VecList:$Rt), (ins GPR64xsp:$Rn),
+ asmop # "\t$Rt, [$Rn]",
+ [],
+ NoItinerary> {
+ let mayLoad = 1;
+ let neverHasSideEffects = 1;
+}
+
+multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
+ def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"), asmop>;
+
+ def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"), asmop>;
+
+ def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"), asmop>;
+
+ def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "1D_operand"), asmop>;
+
+ def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"), asmop>;
+
+ def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"), asmop>;
+
+ def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"), asmop>;
+
+ def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"), asmop>;
+}
+
+// Load single 1-element structure to all lanes of 1 register
+defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
+
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+
+
+class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
+ Instruction INST>
+ : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
+ (VTy (INST GPR64xsp:$Rn))>;
+
+// Match all LD1R instructions
+def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+
+def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
+
+def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+
+def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+
+def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+
+def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+
+def : LD1R_pattern<v1i64, i64, load, LD1R_1D>;
+def : LD1R_pattern<v1f64, f64, load, LD1R_1D>;
+
+def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+
+
+multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
+ RegisterClass RegList> {
+ defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
+ defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
+ defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
+ defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
+}
+
+// Special vector list operand of 128-bit vectors with bare layout.
+// i.e. only show ".b", ".h", ".s", ".d"
+defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
+defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
+defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
+defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
+
+class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane<1, r, op2_1, op0,
+ (outs VList:$Rt),
+ (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn]",
+ [],
+ NoItinerary> {
+ let mayLoad = 1;
+ let neverHasSideEffects = 1;
+ let hasExtraDefRegAllocReq = 1;
+ let Constraints = "$src = $Rt";
+}
+
+multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+ def _B : NeonI_LDN_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H : NeonI_LDN_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S : NeonI_LDN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D : NeonI_LDN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Load single 1-element structure to one lane of 1 register.
+defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
+
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+
+multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+ Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
+ Instruction INST> {
+ def : Pat<(VTy (vector_insert (VTy VPR64:$src),
+ (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
+ (VTy (EXTRACT_SUBREG
+ (INST GPR64xsp:$Rn,
+ (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+ ImmOp:$lane),
+ sub_64))>;
+
+ def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
+ (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
+ (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+}
+
+// Match all LD1LN instructions
+defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+ extloadi8, LD1LN_B>;
+
+defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+ extloadi16, LD1LN_H>;
+
+defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+ load, LD1LN_S>;
+defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+ load, LD1LN_S>;
+
+defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+ load, LD1LN_D>;
+defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+ load, LD1LN_D>;
+
+class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane<0, r, op2_1, op0,
+ (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn]",
+ [],
+ NoItinerary> {
+ let mayStore = 1;
+ let neverHasSideEffects = 1;
+ let hasExtraDefRegAllocReq = 1;
+}
+
+multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
+ def _B : NeonI_STN_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H : NeonI_STN_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S : NeonI_STN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D : NeonI_STN_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ neon_uimm1_bare, asmop>{
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Store single 1-element structure from one lane of 1 register.
+defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
+
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+
+multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+ Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
+ Instruction INST> {
+ def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
+ GPR64xsp:$Rn),
+ (INST GPR64xsp:$Rn,
+ (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
+ ImmOp:$lane)>;
+
+ def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
+ GPR64xsp:$Rn),
+ (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
+}
+
+// Match all ST1LN instructions
+defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+ truncstorei8, ST1LN_B>;
+
+defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+ truncstorei16, ST1LN_H>;
+
+defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+ store, ST1LN_S>;
+defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+ store, ST1LN_S>;
+
+defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+ store, ST1LN_D>;
+defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+ store, ST1LN_D>;
+
+// End of vector load/store single N-element structure (class SIMD lsone).
+
+
+// The following are post-index load/store single N-element instructions
+// (class SIMD lsone-post)
+
+multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
+ RegisterOperand VecList, Operand ImmTy,
+ string asmop> {
+ let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt),
+ asmop # "\t$Rt, [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
+ (outs VecList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
+ asmop # "\t$Rt, [$Rn], $Rm",
+ [],
+ NoItinerary>;
+ }
+}
+
+multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "8B_operand"),
+ uimm_b, asmop>;
+
+ defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "4H_operand"),
+ uimm_h, asmop>;
+
+ defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "2S_operand"),
+ uimm_s, asmop>;
+
+ defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "1D_operand"),
+ uimm_d, asmop>;
+
+ defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
+ !cast<RegisterOperand>(List # "16B_operand"),
+ uimm_b, asmop>;
+
+ defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
+ !cast<RegisterOperand>(List # "8H_operand"),
+ uimm_h, asmop>;
+
+ defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
+ !cast<RegisterOperand>(List # "4S_operand"),
+ uimm_s, asmop>;
+
+ defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
+ !cast<RegisterOperand>(List # "2D_operand"),
+ uimm_d, asmop>;
+}
+
+// Post-index load single 1-element structure to all lanes of 1 register
+defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
+ Constraints = "$Rn = $wb, $Rt = $src",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+ (outs VList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt,
+ VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
+ (outs VList:$Rt, GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
+ VList:$src, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $Rm",
+ [],
+ NoItinerary>;
+}
+
+multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+
+ def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Post-index load single 1-element structure to one lane of 1 register.
+defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index load single N-element structure to one lane of N consecutive
+// registers
+// (N = 2,3,4)
+defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+let mayStore = 1, neverHasSideEffects = 1,
+ hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
+ DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
+ class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, ImmTy:$amt,
+ VList:$Rt, ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $amt",
+ [],
+ NoItinerary> {
+ let Rm = 0b11111;
+ }
+
+ class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
+ Operand ImmTy, Operand ImmOp, string asmop>
+ : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
+ (outs GPR64xsp:$wb),
+ (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
+ ImmOp:$lane),
+ asmop # "\t$Rt[$lane], [$Rn], $Rm",
+ [],
+ NoItinerary>;
+}
+
+multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
+ Operand uimm_b, Operand uimm_h,
+ Operand uimm_s, Operand uimm_d> {
+ def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _B_register : STN_WBReg_Lane<r, 0b00, op0,
+ !cast<RegisterOperand>(List # "B_operand"),
+ uimm_b, neon_uimm4_bare, asmop> {
+ let Inst{12-10} = lane{2-0};
+ let Inst{30} = lane{3};
+ }
+
+ def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _H_register : STN_WBReg_Lane<r, 0b01, op0,
+ !cast<RegisterOperand>(List # "H_operand"),
+ uimm_h, neon_uimm3_bare, asmop> {
+ let Inst{12-10} = {lane{1}, lane{0}, 0b0};
+ let Inst{30} = lane{2};
+ }
+
+ def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _S_register : STN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "S_operand"),
+ uimm_s, neon_uimm2_bare, asmop> {
+ let Inst{12-10} = {lane{0}, 0b0, 0b0};
+ let Inst{30} = lane{1};
+ }
+
+ def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+
+ def _D_register : STN_WBReg_Lane<r, 0b10, op0,
+ !cast<RegisterOperand>(List # "D_operand"),
+ uimm_d, neon_uimm1_bare, asmop> {
+ let Inst{12-10} = 0b001;
+ let Inst{30} = lane{0};
+ }
+}
+
+// Post-index store single 1-element structure from one lane of 1 register.
+defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
+ uimm_exact2, uimm_exact4, uimm_exact8>;
+
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+ uimm_exact4, uimm_exact8, uimm_exact16>;
+defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+ uimm_exact6, uimm_exact12, uimm_exact24>;
+defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+ uimm_exact8, uimm_exact16, uimm_exact32>;
+
+// End of post-index load/store single N-element instructions
+// (class SIMD lsone-post)
// Neon Scalar instructions implementation
// Scalar Three Same
@@ -4737,36 +5359,6 @@ defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
int_aarch64_neon_vpfminnmq, FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
-def neon_uimm0_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm == 0;}]> {
- let ParserMatchClass = neon_uimm0_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm1_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 2;}]> {
- let ParserMatchClass = neon_uimm1_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm2_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 4;}]> {
- let ParserMatchClass = neon_uimm2_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm3_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 8;}]> {
- let ParserMatchClass = uimm3_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm4_bare : Operand<i64>,
- ImmLeaf<i64, [{return Imm < 16;}]> {
- let ParserMatchClass = uimm4_asmoperand;
- let PrintMethod = "printUImmBareOperand";
-}
-
// Scalar by element Arithmetic
@@ -5316,6 +5908,8 @@ def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
+// Scalar Three Same
+
def neon_uimm3 : Operand<i64>,
ImmLeaf<i64, [{return Imm < 8;}]> {
let ParserMatchClass = uimm3_asmoperand;
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index c351dbeb97..1e0033c164 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -1985,6 +1985,7 @@ bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
// Now there are two kinds of vector list when number of vector > 1:
// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
// (2) {Vn.layout - Vm.layout}
+// If the layout is like .b/.h/.s/.d, also parse the lane.
AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
if (Parser.getTok().isNot(AsmToken::LCurly)) {
@@ -2065,7 +2066,7 @@ AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
if (Count > 1) { // If count > 1, create vector list using super register.
- bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
+ bool IsVec64 = (Layout < A64Layout::_16B);
static unsigned SupRegIDs[3][2] = {
{ AArch64::QPairRegClassID, AArch64::DPairRegClassID },
{ AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
@@ -2080,7 +2081,22 @@ AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
Operands.push_back(
AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
- return MatchOperand_Success;
+ if (Parser.getTok().is(AsmToken::LBrac)) {
+ uint32_t NumLanes = 0;
+ switch(Layout) {
+ case A64Layout::_B : NumLanes = 16; break;
+ case A64Layout::_H : NumLanes = 8; break;
+ case A64Layout::_S : NumLanes = 4; break;
+ case A64Layout::_D : NumLanes = 2; break;
+ default:
+ SMLoc Loc = getLexer().getLoc();
+ Error(Loc, "expected comma before next operand");
+ return MatchOperand_ParseFail;
+ }
+ return ParseNEONLane(Operands, NumLanes);
+ } else {
+ return MatchOperand_Success;
+ }
}
// FIXME: We would really like to be able to tablegen'erate this.
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index c4f30628c3..f003d8c04b 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -234,6 +234,10 @@ static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder);
+
static bool Check(DecodeStatus &Out, DecodeStatus In);
#include "AArch64GenDisassemblerTables.inc"
@@ -414,7 +418,7 @@ static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
unsigned RegNo,
uint64_t Address,
const void *Decoder) {
- if (RegNo >= 30)
+ if (RegNo > 30)
return MCDisassembler::Fail;
uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
@@ -1102,3 +1106,426 @@ static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
return MCDisassembler::Success;
}
+
+// Decode post-index vector load/store lane instructions.
+// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
+// operand is an immediate equal the the length of the changed bytes,
+// or Rm is decoded to a GPR64noxzr register.
+static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
+ uint64_t Address,
+ const void *Decoder) {
+ bool Is64bitVec = false;
+ bool IsLoadDup = false;
+ bool IsLoad = false;
+ unsigned TransferBytes = 0; // The total number of bytes transferred.
+ unsigned NumVecs = 0;
+ unsigned Opc = Inst.getOpcode();
+ switch (Opc) {
+ case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+ case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+ case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+ case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
+ TransferBytes = 8; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+ case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+ case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+ case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
+ TransferBytes = 8; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+ case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+ case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+ case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
+ TransferBytes = 16; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+ case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+ case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+ case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
+ TransferBytes = 16; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+ case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+ case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+ case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
+ TransferBytes = 24; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+ case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
+ case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
+ case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
+ TransferBytes = 24; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+ case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+ case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+ case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
+ switch (Opc) {
+ case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
+ TransferBytes = 4; break;
+ case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
+ TransferBytes = 8; break;
+ case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
+ TransferBytes = 16; break;
+ case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
+ TransferBytes = 32; break;
+ }
+ Is64bitVec = true;
+ IsLoadDup = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+ case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
+ case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
+ case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
+ switch (Opc) {
+ case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
+ TransferBytes = 4; break;
+ case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
+ TransferBytes = 8; break;
+ case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
+ TransferBytes = 16; break;
+ case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
+ TransferBytes = 32; break;
+ }
+ IsLoadDup = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+ case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+ case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+ case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
+ TransferBytes = 1; break;
+ case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
+ TransferBytes = 2; break;
+ case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
+ TransferBytes = 4; break;
+ case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
+ TransferBytes = 8; break;
+ }
+ IsLoad = true;
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+ case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+ case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+ case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
+ TransferBytes = 2; break;
+ case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
+ TransferBytes = 4; break;
+ case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
+ TransferBytes = 8; break;
+ case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
+ TransferBytes = 16; break;
+ }
+ IsLoad = true;
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+ case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+ case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+ case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
+ TransferBytes = 24; break;
+ }
+ IsLoad = true;
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+ case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+ case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+ case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
+ TransferBytes = 3; break;
+ case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
+ TransferBytes = 6; break;
+ case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
+ TransferBytes = 12; break;
+ case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
+ TransferBytes = 24; break;
+ }
+ IsLoad = true;
+ NumVecs = 4;
+ break;
+ }
+
+ case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+ case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+ case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+ case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
+ TransferBytes = 1; break;
+ case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
+ TransferBytes = 2; break;
+ case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
+ TransferBytes = 4; break;
+ case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
+ TransferBytes = 8; break;
+ }
+ NumVecs = 1;
+ break;
+ }
+
+ case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+ case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+ case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+ case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
+ TransferBytes = 2; break;
+ case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
+ TransferBytes = 4; break;
+ case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
+ TransferBytes = 8; break;
+ case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
+ TransferBytes = 16; break;
+ }
+ NumVecs = 2;
+ break;
+ }
+
+ case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+ case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+ case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+ case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
+ TransferBytes = 3; break;
+ case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
+ TransferBytes = 6; break;
+ case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
+ TransferBytes = 12; break;
+ case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
+ TransferBytes = 24; break;
+ }
+ NumVecs = 3;
+ break;
+ }
+
+ case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+ case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+ case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+ case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
+ switch (Opc) {
+ case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
+ TransferBytes = 4; break;
+ case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
+ TransferBytes = 8; break;
+ case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
+ TransferBytes = 16; break;
+ case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
+ TransferBytes = 32; break;
+ }
+ NumVecs = 4;
+ break;
+ }
+
+ default:
+ return MCDisassembler::Fail;
+ } // End of switch (Opc)
+
+ unsigned Rt = fieldFromInstruction(Insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(Insn, 5, 5);
+ unsigned Rm = fieldFromInstruction(Insn, 16, 5);
+
+ // Decode post-index of load duplicate lane
+ if (IsLoadDup) {
+ switch (NumVecs) {
+ case 1:
+ Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
+ : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+
+ // Decode write back register, which is equal to Rn.
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+ if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+ Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+ else // Decode Rm
+ DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+ return MCDisassembler::Success;
+ }
+
+ // Decode post-index of load/store lane
+ // Loads have a vector list as output.
+ if (IsLoad) {
+ switch (NumVecs) {
+ case 1:
+ DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+ }
+
+ // Decode write back register, which is equal to Rn.
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+ DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+
+ if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
+ Inst.addOperand(MCOperand::CreateImm(TransferBytes));
+ else // Decode Rm
+ DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+
+ // Decode the source vector list.
+ switch (NumVecs) {
+ case 1:
+ DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 2:
+ DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 3:
+ DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+ break;
+ case 4:
+ DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
+ }
+
+ // Decode lane
+ unsigned Q = fieldFromInstruction(Insn, 30, 1);
+ unsigned S = fieldFromInstruction(Insn, 10, 3);
+ unsigned lane = 0;
+ switch (NumVecs) {
+ case 1:
+ lane = (Q << 3) & S;
+ break;
+ case 2:
+ lane = (Q << 2) & (S >> 1);
+ break;
+ case 3:
+ lane = (Q << 1) & (S >> 2);
+ break;
+ case 4:
+ lane = Q;
+ break;
+ }
+ Inst.addOperand(MCOperand::CreateImm(lane));
+
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index c081691756..24205b57b9 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -521,7 +521,7 @@ void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
std::string LayoutStr = A64VectorLayoutToString(Layout);
O << "{";
if (Count > 1) { // Print sub registers separately
- bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
+ bool IsVec64 = (Layout < A64Layout::_16B);
unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
for (unsigned I = 0; I < Count; I++) {
std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7db5238181..d6ae147182 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -317,7 +317,14 @@ namespace A64Layout {
_16B,
_8H,
_4S,
- _2D
+ _2D,
+
+ // Bare layout for the 128-bit vector
+ // (only show ".b", ".h", ".s", ".d" without vector number)
+ _B,
+ _H,
+ _S,
+ _D
};
}
@@ -332,6 +339,10 @@ A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
case A64Layout::_8H: return ".8h";
case A64Layout::_4S: return ".4s";
case A64Layout::_2D: return ".2d";
+ case A64Layout::_B: return ".b";
+ case A64Layout::_H: return ".h";
+ case A64Layout::_S: return ".s";
+ case A64Layout::_D: return ".d";
default: llvm_unreachable("Unknown Vector Layout");
}
}
@@ -347,6 +358,10 @@ A64StringToVectorLayout(StringRef LayoutStr) {
.Case(".8h", A64Layout::_8H)
.Case(".4s", A64Layout::_4S)
.Case(".2d", A64Layout::_2D)
+ .Case(".b", A64Layout::_B)
+ .Case(".h", A64Layout::_H)
+ .Case(".s", A64Layout::_S)
+ .Case(".d", A64Layout::_D)
.Default(A64Layout::Invalid);
}
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
new file mode 100644
index 0000000000..3f28320f23
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-ldst-one.ll
@@ -0,0 +1,2113 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+%struct.int8x16x2_t = type { [2 x <16 x i8>] }
+%struct.int16x8x2_t = type { [2 x <8 x i16>] }
+%struct.int32x4x2_t = type { [2 x <4 x i32>] }
+%struct.int64x2x2_t = type { [2 x <2 x i64>] }
+%struct.float32x4x2_t = type { [2 x <4 x float>] }
+%struct.float64x2x2_t = type { [2 x <2 x double>] }
+%struct.int8x8x2_t = type { [2 x <8 x i8>] }
+%struct.int16x4x2_t = type { [2 x <4 x i16>] }
+%struct.int32x2x2_t = type { [2 x <2 x i32>] }
+%struct.int64x1x2_t = type { [2 x <1 x i64>] }
+%struct.float32x2x2_t = type { [2 x <2 x float>] }
+%struct.float64x1x2_t = type { [2 x <1 x double>] }
+%struct.int8x16x3_t = type { [3 x <16 x i8>] }
+%struct.int16x8x3_t = type { [3 x <8 x i16>] }
+%struct.int32x4x3_t = type { [3 x <4 x i32>] }
+%struct.int64x2x3_t = type { [3 x <2 x i64>] }
+%struct.float32x4x3_t = type { [3 x <4 x float>] }
+%struct.float64x2x3_t = type { [3 x <2 x double>] }
+%struct.int8x8x3_t = type { [3 x <8 x i8>] }
+%struct.int16x4x3_t = type { [3 x <4 x i16>] }
+%struct.int32x2x3_t = type { [3 x <2 x i32>] }
+%struct.int64x1x3_t = type { [3 x <1 x i64>] }
+%struct.float32x2x3_t = type { [3 x <2 x float>] }
+%struct.float64x1x3_t = type { [3 x <1 x double>] }
+%struct.int8x16x4_t = type { [4 x <16 x i8>] }
+%struct.int16x8x4_t = type { [4 x <8 x i16>] }
+%struct.int32x4x4_t = type { [4 x <4 x i32>] }
+%struct.int64x2x4_t = type { [4 x <2 x i64>] }
+%struct.float32x4x4_t = type { [4 x <4 x float>] }
+%struct.float64x2x4_t = type { [4 x <2 x double>] }
+%struct.int8x8x4_t = type { [4 x <8 x i8>] }
+%struct.int16x4x4_t = type { [4 x <4 x i16>] }
+%struct.int32x2x4_t = type { [4 x <2 x i32>] }
+%struct.int64x1x4_t = type { [4 x <1 x i64>] }
+%struct.float32x2x4_t = type { [4 x <2 x float>] }
+%struct.float64x1x4_t = type { [4 x <1 x double>] }
+
+define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1q_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <16 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ ret <16 x i8> %lane
+}
+
+define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1q_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <8 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %lane
+}
+
+define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1q_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <4 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %lane
+}
+
+define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1q_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <2 x i64> undef, i64 %0, i32 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ ret <2 x i64> %lane
+}
+
+define <4 x float> @test_vld1q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1q_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <4 x float> undef, float %0, i32 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ ret <4 x float> %lane
+}
+
+define <2 x double> @test_vld1q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1q_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <2 x double> undef, double %0, i32 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ ret <2 x double> %lane
+}
+
+define <8 x i8> @test_vld1_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld1_dup_s8
+; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %1 = insertelement <8 x i8> undef, i8 %0, i32 0
+ %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ ret <8 x i8> %lane
+}
+
+define <4 x i16> @test_vld1_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld1_dup_s16
+; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %1 = insertelement <4 x i16> undef, i16 %0, i32 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %lane
+}
+
+define <2 x i32> @test_vld1_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld1_dup_s32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %1 = insertelement <2 x i32> undef, i32 %0, i32 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %lane
+}
+
+define <1 x i64> @test_vld1_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld1_dup_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %1 = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %1
+}
+
+define <2 x float> @test_vld1_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld1_dup_f32
+; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = load float* %a, align 4
+ %1 = insertelement <2 x float> undef, float %0, i32 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ ret <2 x float> %lane
+}
+
+define <1 x double> @test_vld1_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld1_dup_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %1 = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %1
+}
+
+define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2q_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ ret %struct.int8x16x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2q_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2q_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2q_dup_s64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2q_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2q_dup_f64
+; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld2_dup_s8
+; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld2_dup_s16
+; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld2_dup_s32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld2_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld2_dup_f32
+; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld2_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3q_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+ ret %struct.int8x16x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3q_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+ ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3q_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+ ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3q_dup_s64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+ ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3q_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+ %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+ ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3q_dup_f64
+; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+ %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+ ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld3_dup_s8
+; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+ ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld3_dup_s16
+; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+ ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld3_dup_s32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+ ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld3_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+ ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld3_dup_f32
+; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+ %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+ ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld3_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4q_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
+entry:
+ %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
+ %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
+ %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+ %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
+ %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4q_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
+ %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
+ %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
+ %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4q_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
+ %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
+ %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4q_dup_s64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
+ %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
+ %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4q_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
+ %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
+ %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
+ %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
+ %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4q_dup_f64
+; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
+ %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
+ %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
+ %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
+ %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
+; CHECK-LABEL: test_vld4_dup_s8
+; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
+entry:
+ %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
+ %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
+ %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
+ %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
+ %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
+ %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
+; CHECK-LABEL: test_vld4_dup_s16
+; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
+entry:
+ %0 = bitcast i16* %a to i8*
+ %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
+ %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
+ %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
+ %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
+ %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
+ %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
+; CHECK-LABEL: test_vld4_dup_s32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast i32* %a to i8*
+ %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
+ %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
+ %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
+ %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
+ %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
+; CHECK-LABEL: test_vld4_dup_s64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast i64* %a to i8*
+ %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
+ %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
+; CHECK-LABEL: test_vld4_dup_f32
+; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
+entry:
+ %0 = bitcast float* %a to i8*
+ %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
+ %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
+ %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
+ %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
+ %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
+ %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
+ %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
+ %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
+ %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
+; CHECK-LABEL: test_vld4_dup_f64
+; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = bitcast double* %a to i8*
+ %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
+ %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
+ %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
+ %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
+ %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vld1q_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
+ ret <16 x i8> %vld1_lane
+}
+
+define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vld1q_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
+ ret <8 x i16> %vld1_lane
+}
+
+define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vld1q_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
+ ret <4 x i32> %vld1_lane
+}
+
+define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vld1q_lane_s64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
+ ret <2 x i64> %vld1_lane
+}
+
+define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vld1q_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
+ ret <4 x float> %vld1_lane
+}
+
+define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vld1q_lane_f64
+; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
+ ret <2 x double> %vld1_lane
+}
+
+define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vld1_lane_s8
+; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i8* %a, align 1
+ %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
+ ret <8 x i8> %vld1_lane
+}
+
+define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vld1_lane_s16
+; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i16* %a, align 2
+ %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
+ ret <4 x i16> %vld1_lane
+}
+
+define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vld1_lane_s32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load i32* %a, align 4
+ %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
+ ret <2 x i32> %vld1_lane
+}
+
+define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vld1_lane_s64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load i64* %a, align 8
+ %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
+ ret <1 x i64> %vld1_lane
+}
+
+define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vld1_lane_f32
+; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = load float* %a, align 4
+ %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
+ ret <2 x float> %vld1_lane
+}
+
+define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vld1_lane_f64
+; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
+entry:
+ %0 = load double* %a, align 8
+ %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
+ ret <1 x double> %vld1_lane
+}
+
+define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+ %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int16x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float32x4x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2q_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float64x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s8
+; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+ %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int8x8x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s16
+; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+ %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int16x4x2_t %.fca.0.1.insert
+}
+
+define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_s64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.int64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f32
+; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+ %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float32x2x2_t %.fca.0.1.insert
+}
+
+define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld2_lane_f64
+; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+ %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
+ %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
+ %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
+ ret %struct.float64x1x2_t %.fca.0.1.insert
+}
+
+define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int16x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float32x4x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3q_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float64x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s8
+; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int8x8x3_t %.fca.0.2.insert
+}
+
+define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s16
+; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int16x4x3_t %.fca.0.2.insert
+}
+
+define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_s64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.int64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f32
+; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float32x2x3_t %.fca.0.2.insert
+}
+
+define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld3_lane_f64
+; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+ %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+ ret %struct.float64x1x3_t %.fca.0.2.insert
+}
+
+define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int8x16x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int16x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.float32x4x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4q_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.float64x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s8
+; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ %vld3_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ %vld3_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int8x8x4_t %.fca.0.3.insert
+}
+
+define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s16
+; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int16x4x4_t %.fca.0.3.insert
+}
+
+define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_s64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.int64x1x4_t %.fca.0.3.insert
+}
+
+define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f32
+; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.float32x2x4_t %.fca.0.3.insert
+}
+
+define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vld4_lane_f64
+; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+ %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
+ %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
+ %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
+ %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
+ %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
+ %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
+ %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
+ ret %struct.float64x1x4_t %.fca.0.3.insert
+}
+
+define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
+; CHECK-LABEL: test_vst1q_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <16 x i8> %b, i32 15
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
+; CHECK-LABEL: test_vst1q_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i16> %b, i32 7
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vst1q_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i32> %b, i32 3
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vst1q_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i64> %b, i32 1
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
+; CHECK-LABEL: test_vst1q_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x float> %b, i32 3
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
+; CHECK-LABEL: test_vst1q_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x double> %b, i32 1
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
+; CHECK-LABEL: test_vst1_lane_s8
+; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <8 x i8> %b, i32 7
+ store i8 %0, i8* %a, align 1
+ ret void
+}
+
+define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
+; CHECK-LABEL: test_vst1_lane_s16
+; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <4 x i16> %b, i32 3
+ store i16 %0, i16* %a, align 2
+ ret void
+}
+
+define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
+; CHECK-LABEL: test_vst1_lane_s32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x i32> %b, i32 1
+ store i32 %0, i32* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
+; CHECK-LABEL: test_vst1_lane_s64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x i64> %b, i32 0
+ store i64 %0, i64* %a, align 8
+ ret void
+}
+
+define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
+; CHECK-LABEL: test_vst1_lane_f32
+; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <2 x float> %b, i32 1
+ store float %0, float* %a, align 4
+ ret void
+}
+
+define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
+; CHECK-LABEL: test_vst1_lane_f64
+; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %0 = extractelement <1 x double> %b, i32 0
+ store double %0, double* %a, align 8
+ ret void
+}
+
+define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2q_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s8
+; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
+ tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s16
+; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_s64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f32
+; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst2_lane_f64
+; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
+ ret void
+}
+
+define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3q_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s8
+; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
+ tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s16
+; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_s64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f32
+; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst3_lane_f64
+; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
+ ret void
+}
+
+define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
+ ret void
+}
+
+define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
+ ret void
+}
+
+define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4q_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s8
+; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
+ tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
+ ret void
+}
+
+define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s16
+; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
+ %0 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
+ ret void
+}
+
+define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
+ %0 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_s64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
+ %0 = bitcast i64* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
+ ret void
+}
+
+define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f32
+; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %0 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
+ ret void
+}
+
+define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
+; CHECK-LABEL: test_vst4_lane_f64
+; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
+entry:
+ %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
+ %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
+ %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
+ %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
+ %0 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
+ ret void
+}
+
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
+declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
+declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32) \ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
new file mode 100644
index 0000000000..80a934700c
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
@@ -0,0 +1,319 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
+; CHECK-LABEL: test_vld2q_dup_fx_update
+; CHECK: ld2r {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}], #2
+ %1 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
+ %2 = extractvalue { <16 x i8>, <16 x i8> } %1, 0
+ %3 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
+ %4 = extractvalue { <16 x i8>, <16 x i8> } %1, 1
+ %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
+ %6 = insertvalue { [2 x <16 x i8>] } undef, <16 x i8> %3, 0, 0
+ %7 = insertvalue { [2 x <16 x i8>] } %6, <16 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <16 x i8>] } %7
+}
+
+define { [2 x <4 x i32>] } @test_vld2q_dup_reg_update(i32* %a, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2q_dup_reg_update
+; CHECK: ld2r {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %1, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
+ %3 = extractvalue { <4 x i32>, <4 x i32> } %2, 0
+ %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
+ %5 = extractvalue { <4 x i32>, <4 x i32> } %2, 1
+ %6 = shufflevector <4 x i32> %5, <4 x i32> undef, <4 x i32> zeroinitializer
+ %7 = insertvalue { [2 x <4 x i32>] } undef, <4 x i32> %4, 0, 0
+ %8 = insertvalue { [2 x <4 x i32>] } %7, <4 x i32> %6, 0, 1
+ %tmp1 = getelementptr i32* %a, i32 %inc
+ store i32* %tmp1, i32** %ptr
+ ret { [2 x <4 x i32>] } %8
+}
+
+define { [3 x <4 x i16>] } @test_vld3_dup_fx_update(i16* %a, i16** %ptr) {
+; CHECK-LABEL: test_vld3_dup_fx_update
+; CHECK: ld3r {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}], #6
+ %1 = bitcast i16* %a to i8*
+ %2 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %1, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
+ %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 0
+ %4 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
+ %5 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 1
+ %6 = shufflevector <4 x i16> %5, <4 x i16> undef, <4 x i32> zeroinitializer
+ %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %2, 2
+ %8 = shufflevector <4 x i16> %7, <4 x i16> undef, <4 x i32> zeroinitializer
+ %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %4, 0, 0
+ %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %6, 0, 1
+ %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
+ %tmp1 = getelementptr i16* %a, i32 3
+ store i16* %tmp1, i16** %ptr
+ ret { [3 x <4 x i16>] } %11
+}
+
+define { [3 x <8 x i8>] } @test_vld3_dup_reg_update(i8* %a, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_dup_reg_update
+; CHECK: ld3r {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
+ %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 0
+ %3 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
+ %4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 1
+ %5 = shufflevector <8 x i8> %4, <8 x i8> undef, <8 x i32> zeroinitializer
+ %6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %1, 2
+ %7 = shufflevector <8 x i8> %6, <8 x i8> undef, <8 x i32> zeroinitializer
+ %8 = insertvalue { [3 x <8 x i8>] } undef, <8 x i8> %3, 0, 0
+ %9 = insertvalue { [3 x <8 x i8>] } %8, <8 x i8> %5, 0, 1
+ %10 = insertvalue { [3 x <8 x i8>] } %9, <8 x i8> %7, 0, 2
+ %tmp1 = getelementptr i8* %a, i32 %inc
+ store i8* %tmp1, i8** %ptr
+ ret { [3 x <8 x i8>] }%10
+}
+
+define { [4 x <2 x i32>] } @test_vld4_dup_fx_update(i32* %a, i32** %ptr) #0 {
+; CHECK-LABEL: test_vld4_dup_fx_update
+; CHECK: ld4r {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}], #16
+ %1 = bitcast i32* %a to i8*
+ %2 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %1, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
+ %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 0
+ %4 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
+ %5 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 1
+ %6 = shufflevector <2 x i32> %5, <2 x i32> undef, <2 x i32> zeroinitializer
+ %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 2
+ %8 = shufflevector <2 x i32> %7, <2 x i32> undef, <2 x i32> zeroinitializer
+ %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %2, 3
+ %10 = shufflevector <2 x i32> %9, <2 x i32> undef, <2 x i32> zeroinitializer
+ %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %4, 0, 0
+ %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %6, 0, 1
+ %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %8, 0, 2
+ %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+ %tmp1 = getelementptr i32* %a, i32 4
+ store i32* %tmp1, i32** %ptr
+ ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_dup_reg_update(double* %a, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_dup_reg_update
+; CHECK: ld4r {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = bitcast double* %a to i8*
+ %2 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %1, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
+ %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 0
+ %4 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
+ %5 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 1
+ %6 = shufflevector <2 x double> %5, <2 x double> undef, <2 x i32> zeroinitializer
+ %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 2
+ %8 = shufflevector <2 x double> %7, <2 x double> undef, <2 x i32> zeroinitializer
+ %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %2, 3
+ %10 = shufflevector <2 x double> %9, <2 x double> undef, <2 x i32> zeroinitializer
+ %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %4, 0, 0
+ %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %6, 0, 1
+ %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %8, 0, 2
+ %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+ %tmp1 = getelementptr double* %a, i32 %inc
+ store double* %tmp1, double** %ptr
+ ret { [4 x <2 x double>] } %14
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vld2_lane_fx_update
+; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+ %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+ %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+ %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+ %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <8 x i8>] } %7
+}
+
+define { [2 x <8 x i8>] } @test_vld2_lane_reg_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld2_lane_reg_update
+; CHECK: ld2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[6], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ %3 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 6, i32 1)
+ %4 = extractvalue { <8 x i8>, <8 x i8> } %3, 0
+ %5 = extractvalue { <8 x i8>, <8 x i8> } %3, 1
+ %6 = insertvalue { [2 x <8 x i8>] } undef, <8 x i8> %4, 0, 0
+ %7 = insertvalue { [2 x <8 x i8>] } %6, <8 x i8> %5, 0, 1
+ %tmp1 = getelementptr i8* %a, i32 %inc
+ store i8* %tmp1, i8** %ptr
+ ret { [2 x <8 x i8>] } %7
+}
+
+define { [3 x <2 x float>] } @test_vld3_lane_fx_update(float* %a, [3 x <2 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vld3_lane_fx_update
+; CHECK: ld3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #12
+ %1 = extractvalue [3 x <2 x float>] %b, 0
+ %2 = extractvalue [3 x <2 x float>] %b, 1
+ %3 = extractvalue [3 x <2 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ %5 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %4, <2 x float> %1, <2 x float> %2, <2 x float> %3, i32 1, i32 4)
+ %6 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 0
+ %7 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 1
+ %8 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %5, 2
+ %9 = insertvalue { [3 x <2 x float>] } undef, <2 x float> %6, 0, 0
+ %10 = insertvalue { [3 x <2 x float>] } %9, <2 x float> %7, 0, 1
+ %11 = insertvalue { [3 x <2 x float>] } %10, <2 x float> %8, 0, 2
+ %tmp1 = getelementptr float* %a, i32 3
+ store float* %tmp1, float** %ptr
+ ret { [3 x <2 x float>] } %11
+}
+
+define { [3 x <4 x i16>] } @test_vld3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld3_lane_reg_update
+; CHECK: ld3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [3 x <4 x i16>] %b, 0
+ %2 = extractvalue [3 x <4 x i16>] %b, 1
+ %3 = extractvalue [3 x <4 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ %5 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+ %6 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 0
+ %7 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 1
+ %8 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %5, 2
+ %9 = insertvalue { [3 x <4 x i16>] } undef, <4 x i16> %6, 0, 0
+ %10 = insertvalue { [3 x <4 x i16>] } %9, <4 x i16> %7, 0, 1
+ %11 = insertvalue { [3 x <4 x i16>] } %10, <4 x i16> %8, 0, 2
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret { [3 x <4 x i16>] } %11
+}
+
+define { [4 x <2 x i32>] } @test_vld4_lane_fx_update(i32* readonly %a, [4 x <2 x i32>] %b, i32** %ptr) {
+; CHECK-LABEL: test_vld4_lane_fx_update
+; CHECK: ld4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], #16
+ %1 = extractvalue [4 x <2 x i32>] %b, 0
+ %2 = extractvalue [4 x <2 x i32>] %b, 1
+ %3 = extractvalue [4 x <2 x i32>] %b, 2
+ %4 = extractvalue [4 x <2 x i32>] %b, 3
+ %5 = bitcast i32* %a to i8*
+ %6 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %5, <2 x i32> %1, <2 x i32> %2, <2 x i32> %3, <2 x i32> %4, i32 1, i32 4)
+ %7 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 0
+ %8 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 1
+ %9 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 2
+ %10 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %6, 3
+ %11 = insertvalue { [4 x <2 x i32>] } undef, <2 x i32> %7, 0, 0
+ %12 = insertvalue { [4 x <2 x i32>] } %11, <2 x i32> %8, 0, 1
+ %13 = insertvalue { [4 x <2 x i32>] } %12, <2 x i32> %9, 0, 2
+ %14 = insertvalue { [4 x <2 x i32>] } %13, <2 x i32> %10, 0, 3
+ %tmp1 = getelementptr i32* %a, i32 4
+ store i32* %tmp1, i32** %ptr
+ ret { [4 x <2 x i32>] } %14
+}
+
+define { [4 x <2 x double>] } @test_vld4_lane_reg_update(double* readonly %a, [4 x <2 x double>] %b, double** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vld4_lane_reg_update
+; CHECK: ld4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [4 x <2 x double>] %b, 0
+ %2 = extractvalue [4 x <2 x double>] %b, 1
+ %3 = extractvalue [4 x <2 x double>] %b, 2
+ %4 = extractvalue [4 x <2 x double>] %b, 3
+ %5 = bitcast double* %a to i8*
+ %6 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+ %7 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 0
+ %8 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 1
+ %9 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 2
+ %10 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %6, 3
+ %11 = insertvalue { [4 x <2 x double>] } undef, <2 x double> %7, 0, 0
+ %12 = insertvalue { [4 x <2 x double>] } %11, <2 x double> %8, 0, 1
+ %13 = insertvalue { [4 x <2 x double>] } %12, <2 x double> %9, 0, 2
+ %14 = insertvalue { [4 x <2 x double>] } %13, <2 x double> %10, 0, 3
+ %tmp1 = getelementptr double* %a, i32 %inc
+ store double* %tmp1, double** %ptr
+ ret { [4 x <2 x double>] } %14
+}
+
+define void @test_vst2_lane_fx_update(i8* %a, [2 x <8 x i8>] %b, i8** %ptr) {
+; CHECK-LABEL: test_vst2_lane_fx_update
+; CHECK: st2 {v{{[0-9]+}}.b, v{{[0-9]+}}.b}[7], [x{{[0-9]+|sp}}], #2
+ %1 = extractvalue [2 x <8 x i8>] %b, 0
+ %2 = extractvalue [2 x <8 x i8>] %b, 1
+ call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %1, <8 x i8> %2, i32 7, i32 1)
+ %tmp1 = getelementptr i8* %a, i32 2
+ store i8* %tmp1, i8** %ptr
+ ret void
+}
+
+define void @test_vst2_lane_reg_update(i32* %a, [2 x <2 x i32>] %b.coerce, i32** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst2_lane_reg_update
+; CHECK: st2 {v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [2 x <2 x i32>] %b.coerce, 0
+ %2 = extractvalue [2 x <2 x i32>] %b.coerce, 1
+ %3 = bitcast i32* %a to i8*
+ tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %3, <2 x i32> %1, <2 x i32> %2, i32 1, i32 4)
+ %tmp1 = getelementptr i32* %a, i32 %inc
+ store i32* %tmp1, i32** %ptr
+ ret void
+}
+
+define void @test_vst3_lane_fx_update(float* %a, [3 x <4 x float>] %b, float** %ptr) {
+; CHECK-LABEL: test_vst3_lane_fx_update
+; CHECK: st3 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[3], [x{{[0-9]+|sp}}], #12
+ %1 = extractvalue [3 x <4 x float>] %b, 0
+ %2 = extractvalue [3 x <4 x float>] %b, 1
+ %3 = extractvalue [3 x <4 x float>] %b, 2
+ %4 = bitcast float* %a to i8*
+ call void @llvm.arm.neon.vst3lane.v4f32(i8* %4, <4 x float> %1, <4 x float> %2, <4 x float> %3, i32 3, i32 4)
+ %tmp1 = getelementptr float* %a, i32 3
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+; Function Attrs: nounwind
+define void @test_vst3_lane_reg_update(i16* %a, [3 x <4 x i16>] %b, i16** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst3_lane_reg_update
+; CHECK: st3 {v{{[0-9]+}}.h, v{{[0-9]+}}.h, v{{[0-9]+}}.h}[3], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [3 x <4 x i16>] %b, 0
+ %2 = extractvalue [3 x <4 x i16>] %b, 1
+ %3 = extractvalue [3 x <4 x i16>] %b, 2
+ %4 = bitcast i16* %a to i8*
+ tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %4, <4 x i16> %1, <4 x i16> %2, <4 x i16> %3, i32 3, i32 2)
+ %tmp1 = getelementptr i16* %a, i32 %inc
+ store i16* %tmp1, i16** %ptr
+ ret void
+}
+
+define void @test_vst4_lane_fx_update(double* %a, [4 x <2 x double>] %b.coerce, double** %ptr) {
+; CHECK-LABEL: test_vst4_lane_fx_update
+; CHECK: st4 {v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d, v{{[0-9]+}}.d}[1], [x{{[0-9]+|sp}}], #32
+ %1 = extractvalue [4 x <2 x double>] %b.coerce, 0
+ %2 = extractvalue [4 x <2 x double>] %b.coerce, 1
+ %3 = extractvalue [4 x <2 x double>] %b.coerce, 2
+ %4 = extractvalue [4 x <2 x double>] %b.coerce, 3
+ %5 = bitcast double* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %5, <2 x double> %1, <2 x double> %2, <2 x double> %3, <2 x double> %4, i32 1, i32 8)
+ %tmp1 = getelementptr double* %a, i32 4
+ store double* %tmp1, double** %ptr
+ ret void
+}
+
+
+define void @test_vst4_lane_reg_update(float* %a, [4 x <2 x float>] %b.coerce, float** %ptr, i32 %inc) {
+; CHECK-LABEL: test_vst4_lane_reg_update
+; CHECK: st4 {v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s, v{{[0-9]+}}.s}[1], [x{{[0-9]+|sp}}], x{{[0-9]+}}
+ %1 = extractvalue [4 x <2 x float>] %b.coerce, 0
+ %2 = extractvalue [4 x <2 x float>] %b.coerce, 1
+ %3 = extractvalue [4 x <2 x float>] %b.coerce, 2
+ %4 = extractvalue [4 x <2 x float>] %b.coerce, 3
+ %5 = bitcast float* %a to i8*
+ tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %5, <2 x float> %1, <2 x float> %2, <2 x float> %3, <2 x float> %4, i32 1, i32 4)
+ %tmp1 = getelementptr float* %a, i32 %inc
+ store float* %tmp1, float** %ptr
+ ret void
+}
+
+declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
+declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
+declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
+declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
+declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
+declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
+declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index eaa5c562f3..74719e09a5 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -4170,6 +4170,125 @@
// CHECK-ERROR: st4 {v31.2d, v0.2d, v1.2d, v2.1d}, [x3], x1
// CHECK-ERROR: ^
+//------------------------------------------------------------------------------
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 1,2,3,4)
+//------------------------------------------------------------------------------
+ ld1r {x1}, [x0]
+ ld2r {v31.4s, v0.2s}, [sp]
+ ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+ ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp]
+// CHECK-ERROR: error: expected vector type register
+// CHECK-ERROR: ld1r {x1}, [x0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR: ld2r {v31.4s, v0.2s}, [sp]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld3r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid space between two vectors
+// CHECK-ERROR: ld4r {v31.2s, v0.2s, v1.2d, v2.2s}, [sp]
+// CHECK-ERROR: ^
+
+//------------------------------------------------------------------------------
+// Load/Store single N-element structure to/from one lane of N consecutive
+// registers (N = 1, 2,3,4)
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[16], [x0]
+ ld2 {v15.h, v16.h}[8], [x15]
+ ld3 {v31.s, v0.s, v1.s}[-1], [sp]
+ ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:: error: lane number incompatible with layout
+// CHECK-ERROR: ld1 {v0.b}[16], [x0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ld2 {v15.h, v16.h}[8], [x15]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected lane number
+// CHECK-ERROR: ld3 {v31.s, v0.s, v1.s}[-1], [sp]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: lane number incompatible with layout
+// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR: ^
+
+ st1 {v0.d}[16], [x0]
+ st2 {v31.s, v0.s}[3], [8]
+ st3 {v15.h, v16.h, v17.h}[-1], [x15]
+ st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR:: error: lane number incompatible with layout
+// CHECK-ERROR: st1 {v0.d}[16], [x0]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st2 {v31.s, v0.s}[3], [8]
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected lane number
+// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[-1], [x15]
+// CHECK-ERROR: ^
+// CHECK-ERROR: lane number incompatible with layout
+// CHECK-ERROR: st4 {v0.d, v1.d, v2.d, v3.d}[2], [x0]
+// CHECK-ERROR: ^
+
+//------------------------------------------------------------------------------
+// Post-index of load single N-element structure to all lanes of N consecutive
+// registers (N = 1,2,3,4)
+//------------------------------------------------------------------------------
+ ld1r {v15.8h}, [x15], #5
+ ld2r {v0.2d, v1.2d}, [x0], #7
+ ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1
+ ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld1r {v15.8h}, [x15], #5
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld2r {v0.2d, v1.2d}, [x0], #7
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], sp
+// CHECK-ERROR: ^
+
+//------------------------------------------------------------------------------
+// Post-index of Load/Store single N-element structure to/from one lane of N
+// consecutive registers (N = 1, 2,3,4)
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[0], [x0], #2
+ ld2 {v15.h, v16.h}[0], [x15], #3
+ ld3 {v31.s, v0.s, v1.d}[0], [sp], x9
+ ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld1 {v0.b}[0], [x0], #2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld2 {v15.h, v16.h}[0], [x15], #3
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected the same vector layout
+// CHECK-ERROR: ld3 {v31.s, v0.s, v1.d}[0], [sp], x9
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24
+// CHECK-ERROR: ^
+
+ st1 {v0.d}[0], [x0], #7
+ st2 {v31.s, v0.s}[0], [sp], #6
+ st3 {v15.h, v16.h, v17.h}[0], [x15], #8
+ st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st1 {v0.d}[0], [x0], #7
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st2 {v31.s, v0.s}[0], [sp], #6
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st3 {v15.h, v16.h, v17.h}[0], [x15], #8
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], #1
+// CHECK-ERROR: ^
+
+
ins v2.b[16], w1
ins v7.h[8], w14
ins v20.s[5], w30
diff --git a/test/MC/AArch64/neon-simd-ldst-one-elem.s b/test/MC/AArch64/neon-simd-ldst-one-elem.s
new file mode 100644
index 0000000000..140d7525fe
--- /dev/null
+++ b/test/MC/AArch64/neon-simd-ldst-one-elem.s
@@ -0,0 +1,325 @@
+// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+ ld1r {v0.16b}, [x0]
+ ld1r {v15.8h}, [x15]
+ ld1r {v31.4s}, [sp]
+ ld1r {v0.2d}, [x0]
+ ld1r {v0.8b}, [x0]
+ ld1r {v15.4h}, [x15]
+ ld1r {v31.2s}, [sp]
+ ld1r {v0.1d}, [x0]
+// CHECK: ld1r {v0.16b}, [x0] // encoding: [0x00,0xc0,0x40,0x4d]
+// CHECK: ld1r {v15.8h}, [x15] // encoding: [0xef,0xc5,0x40,0x4d]
+// CHECK: ld1r {v31.4s}, [sp] // encoding: [0xff,0xcb,0x40,0x4d]
+// CHECK: ld1r {v0.2d}, [x0] // encoding: [0x00,0xcc,0x40,0x4d]
+// CHECK: ld1r {v0.8b}, [x0] // encoding: [0x00,0xc0,0x40,0x0d]
+// CHECK: ld1r {v15.4h}, [x15] // encoding: [0xef,0xc5,0x40,0x0d]
+// CHECK: ld1r {v31.2s}, [sp] // encoding: [0xff,0xcb,0x40,0x0d]
+// CHECK: ld1r {v0.1d}, [x0] // encoding: [0x00,0xcc,0x40,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2r {v0.16b, v1.16b}, [x0]
+ ld2r {v15.8h, v16.8h}, [x15]
+ ld2r {v31.4s, v0.4s}, [sp]
+ ld2r {v0.2d, v1.2d}, [x0]
+ ld2r {v0.8b, v1.8b}, [x0]
+ ld2r {v15.4h, v16.4h}, [x15]
+ ld2r {v31.2s, v0.2s}, [sp]
+ ld2r {v31.1d, v0.1d}, [sp]
+// CHECK: ld2r {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xc0,0x60,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xc5,0x60,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xcb,0x60,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xcc,0x60,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xc0,0x60,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xc5,0x60,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xcb,0x60,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [sp] // encoding: [0xff,0xcf,0x60,0x0d]
+
+ ld3r {v0.16b, v1.16b, v2.16b}, [x0]
+ ld3r {v15.8h, v16.8h, v17.8h}, [x15]
+ ld3r {v31.4s, v0.4s, v1.4s}, [sp]
+ ld3r {v0.2d, v1.2d, v2.2d}, [x0]
+ ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+ ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+ ld3r {v31.2s, v0.2s, v1.2s}, [sp]
+ ld3r {v31.1d, v0.1d, v1.1d}, [sp]
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0xe0,0x40,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0xe5,0x40,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0xeb,0x40,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0xec,0x40,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0xe0,0x40,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0xe5,0x40,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0xeb,0x40,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp] // encoding: [0xff,0xef,0x40,0x0d]
+
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
+ ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
+ ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
+ ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
+ ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
+ ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+ ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0xe0,0x60,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0xe5,0x60,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0xeb,0x60,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0xec,0x60,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0xe0,0x60,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0xe5,0x60,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0xeb,0x60,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp] // encoding: [0xff,0xef,0x60,0x0d]
+
+//------------------------------------------------------------------------------
+// Load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[9], [x0]
+ ld1 {v15.h}[7], [x15]
+ ld1 {v31.s}[3], [sp]
+ ld1 {v0.d}[1], [x0]
+// CHECK: ld1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x40,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x40,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x40,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x40,0x4d]
+
+//------------------------------------------------------------------------------
+// Load single N-element structure to one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2 {v0.b, v1.b}[9], [x0]
+ ld2 {v15.h, v16.h}[7], [x15]
+ ld2 {v31.s, v0.s}[3], [sp]
+ ld2 {v0.d, v1.d}[1], [x0]
+// CHECK: ld2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x60,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x60,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x60,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x60,0x4d]
+
+ ld3 {v0.b, v1.b, v2.b}[9], [x0]
+ ld3 {v15.h, v16.h, v17.h}[7], [x15]
+ ld3 {v31.s, v0.s, v1.s}[3], [sp]
+ ld3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x40,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x40,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x40,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x40,0x4d]
+
+ ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+ ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+ ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+ ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x60,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x60,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x60,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x60,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+ st1 {v0.b}[9], [x0]
+ st1 {v15.h}[7], [x15]
+ st1 {v31.s}[3], [sp]
+ st1 {v0.d}[1], [x0]
+// CHECK: st1 {v0.b}[9], [x0] // encoding: [0x00,0x04,0x00,0x4d]
+// CHECK: st1 {v15.h}[7], [x15] // encoding: [0xef,0x59,0x00,0x4d]
+// CHECK: st1 {v31.s}[3], [sp] // encoding: [0xff,0x93,0x00,0x4d]
+// CHECK: st1 {v0.d}[1], [x0] // encoding: [0x00,0x84,0x00,0x4d]
+
+//------------------------------------------------------------------------------
+// Store single N-element structure from one lane of N consecutive registers
+// (N = 2,3,4)
+//------------------------------------------------------------------------------
+ st2 {v0.b, v1.b}[9], [x0]
+ st2 {v15.h, v16.h}[7], [x15]
+ st2 {v31.s, v0.s}[3], [sp]
+ st2 {v0.d, v1.d}[1], [x0]
+// CHECK: st2 {v0.b, v1.b}[9], [x0] // encoding: [0x00,0x04,0x20,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15] // encoding: [0xef,0x59,0x20,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp] // encoding: [0xff,0x93,0x20,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0] // encoding: [0x00,0x84,0x20,0x4d]
+
+ st3 {v0.b, v1.b, v2.b}[9], [x0]
+ st3 {v15.h, v16.h, v17.h}[7], [x15]
+ st3 {v31.s, v0.s, v1.s}[3], [sp]
+ st3 {v0.d, v1.d, v2.d}[1], [x0]
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0] // encoding: [0x00,0x24,0x00,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15] // encoding: [0xef,0x79,0x00,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp] // encoding: [0xff,0xb3,0x00,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0] // encoding: [0x00,0xa4,0x00,0x4d]
+
+ st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+ st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15]
+ st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp]
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0] // encoding: [0x00,0x24,0x20,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15] // encoding: [0xef,0x79,0x20,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp] // encoding: [0xff,0xb3,0x20,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0] // encoding: [0x00,0xa4,0x20,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index oad single 1-element structure to all lanes of 1 register
+//------------------------------------------------------------------------------
+ ld1r {v0.16b}, [x0], #1
+ ld1r {v15.8h}, [x15], #2
+ ld1r {v31.4s}, [sp], #4
+ ld1r {v0.2d}, [x0], #8
+ ld1r {v0.8b}, [x0], x0
+ ld1r {v15.4h}, [x15], x1
+ ld1r {v31.2s}, [sp], x2
+ ld1r {v0.1d}, [x0], x3
+// CHECK: ld1r {v0.16b}, [x0], #1 // encoding: [0x00,0xc0,0xdf,0x4d]
+// CHECK: ld1r {v15.8h}, [x15], #2 // encoding: [0xef,0xc5,0xdf,0x4d]
+// CHECK: ld1r {v31.4s}, [sp], #4 // encoding: [0xff,0xcb,0xdf,0x4d]
+// CHECK: ld1r {v0.2d}, [x0], #8 // encoding: [0x00,0xcc,0xdf,0x4d]
+// CHECK: ld1r {v0.8b}, [x0], x0 // encoding: [0x00,0xc0,0xc0,0x0d]
+// CHECK: ld1r {v15.4h}, [x15], x1 // encoding: [0xef,0xc5,0xc1,0x0d]
+// CHECK: ld1r {v31.2s}, [sp], x2 // encoding: [0xff,0xcb,0xc2,0x0d]
+// CHECK: ld1r {v0.1d}, [x0], x3 // encoding: [0x00,0xcc,0xc3,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to all lanes of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2r {v0.16b, v1.16b}, [x0], #2
+ ld2r {v15.8h, v16.8h}, [x15], #4
+ ld2r {v31.4s, v0.4s}, [sp], #8
+ ld2r {v0.2d, v1.2d}, [x0], #16
+ ld2r {v0.8b, v1.8b}, [x0], x6
+ ld2r {v15.4h, v16.4h}, [x15], x7
+ ld2r {v31.2s, v0.2s}, [sp], x9
+ ld2r {v31.1d, v0.1d}, [x0], x5
+// CHECK: ld2r {v0.16b, v1.16b}, [x0], #2 // encoding: [0x00,0xc0,0xff,0x4d]
+// CHECK: ld2r {v15.8h, v16.8h}, [x15], #4 // encoding: [0xef,0xc5,0xff,0x4d]
+// CHECK: ld2r {v31.4s, v0.4s}, [sp], #8 // encoding: [0xff,0xcb,0xff,0x4d]
+// CHECK: ld2r {v0.2d, v1.2d}, [x0], #16 // encoding: [0x00,0xcc,0xff,0x4d]
+// CHECK: ld2r {v0.8b, v1.8b}, [x0], x6 // encoding: [0x00,0xc0,0xe6,0x0d]
+// CHECK: ld2r {v15.4h, v16.4h}, [x15], x7 // encoding: [0xef,0xc5,0xe7,0x0d]
+// CHECK: ld2r {v31.2s, v0.2s}, [sp], x9 // encoding: [0xff,0xcb,0xe9,0x0d]
+// CHECK: ld2r {v31.1d, v0.1d}, [x0], x5 // encoding: [0x1f,0xcc,0xe5,0x0d]
+
+ ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9
+ ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6
+ ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7
+ ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5
+ ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+ ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+ ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12
+ ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24
+// CHECK: ld3r {v0.16b, v1.16b, v2.16b}, [x0], x9 // encoding: [0x00,0xe0,0xc9,0x4d]
+// CHECK: ld3r {v15.8h, v16.8h, v17.8h}, [x15], x6 // encoding: [0xef,0xe5,0xc6,0x4d]
+// CHECK: ld3r {v31.4s, v0.4s, v1.4s}, [sp], x7 // encoding: [0xff,0xeb,0xc7,0x4d]
+// CHECK: ld3r {v0.2d, v1.2d, v2.2d}, [x0], x5 // encoding: [0x00,0xec,0xc5,0x4d]
+// CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3 // encoding: [0x00,0xe0,0xdf,0x0d]
+// CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6 // encoding: [0xef,0xe5,0xdf,0x0d]
+// CHECK: ld3r {v31.2s, v0.2s, v1.2s}, [sp], #12 // encoding: [0xff,0xeb,0xdf,0x0d]
+// CHECK: ld3r {v31.1d, v0.1d, v1.1d}, [sp], #24 // encoding: [0xff,0xef,0xdf,0x0d]
+
+ ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4
+ ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8
+ ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16
+ ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32
+ ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5
+ ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9
+ ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+ ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+// CHECK: ld4r {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], #4 // encoding: [0x00,0xe0,0xff,0x4d]
+// CHECK: ld4r {v15.8h, v16.8h, v17.8h, v18.8h}, [x15], #8 // encoding: [0xef,0xe5,0xff,0x4d]
+// CHECK: ld4r {v31.4s, v0.4s, v1.4s, v2.4s}, [sp], #16 // encoding: [0xff,0xeb,0xff,0x4d]
+// CHECK: ld4r {v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #32 // encoding: [0x00,0xec,0xff,0x4d]
+// CHECK: ld4r {v0.8b, v1.8b, v2.8b, v3.8b}, [x0], x5 // encoding: [0x00,0xe0,0xe5,0x0d]
+// CHECK: ld4r {v15.4h, v16.4h, v17.4h, v18.4h}, [x15], x9 // encoding: [0xef,0xe5,0xe9,0x0d]
+// CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30 // encoding: [0xff,0xeb,0xfe,0x0d]
+// CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7 // encoding: [0xff,0xef,0xe7,0x0d]
+
+//------------------------------------------------------------------------------
+// Post-index load single 1-element structure to one lane of 1 register.
+//------------------------------------------------------------------------------
+ ld1 {v0.b}[9], [x0], #1
+ ld1 {v15.h}[7], [x15], x9
+ ld1 {v31.s}[3], [sp], x6
+ ld1 {v0.d}[1], [x0], #8
+// CHECK: ld1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0xdf,0x4d]
+// CHECK: ld1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0xc9,0x4d]
+// CHECK: ld1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0xc6,0x4d]
+// CHECK: ld1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0xdf,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index load single N-element structure to one lane of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ ld2 {v0.b, v1.b}[9], [x0], x3
+ ld2 {v15.h, v16.h}[7], [x15], #4
+ ld2 {v31.s, v0.s}[3], [sp], #8
+ ld2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: ld2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xe3,0x4d]
+// CHECK: ld2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xff,0x4d]
+// CHECK: ld2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xff,0x4d]
+// CHECK: ld2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xe0,0x4d]
+
+ ld3 {v0.b, v1.b, v2.b}[9], [x0], #3
+ ld3 {v15.h, v16.h, v17.h}[7], [x15], #6
+ ld3 {v31.s, v0.s, v1.s}[3], [sp], x3
+ ld3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: ld3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0xdf,0x4d]
+// CHECK: ld3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0xdf,0x4d]
+// CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0xc3,0x4d]
+// CHECK: ld3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0xc6,0x4d]
+
+ ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+ ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+ ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+ ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: ld4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xe5,0x4d]
+// CHECK: ld4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xe7,0x4d]
+// CHECK: ld4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xff,0x4d]
+// CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xff,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single 1-element structure from one lane of 1 register.
+//------------------------------------------------------------------------------
+ st1 {v0.b}[9], [x0], #1
+ st1 {v15.h}[7], [x15], x9
+ st1 {v31.s}[3], [sp], x6
+ st1 {v0.d}[1], [x0], #8
+// CHECK: st1 {v0.b}[9], [x0], #1 // encoding: [0x00,0x04,0x9f,0x4d]
+// CHECK: st1 {v15.h}[7], [x15], x9 // encoding: [0xef,0x59,0x89,0x4d]
+// CHECK: st1 {v31.s}[3], [sp], x6 // encoding: [0xff,0x93,0x86,0x4d]
+// CHECK: st1 {v0.d}[1], [x0], #8 // encoding: [0x00,0x84,0x9f,0x4d]
+
+//------------------------------------------------------------------------------
+// Post-index store single N-element structure from one lane of N consecutive
+// registers (N = 2,3,4)
+//------------------------------------------------------------------------------
+ st2 {v0.b, v1.b}[9], [x0], x3
+ st2 {v15.h, v16.h}[7], [x15], #4
+ st2 {v31.s, v0.s}[3], [sp], #8
+ st2 {v0.d, v1.d}[1], [x0], x0
+// CHECK: st2 {v0.b, v1.b}[9], [x0], x3 // encoding: [0x00,0x04,0xa3,0x4d]
+// CHECK: st2 {v15.h, v16.h}[7], [x15], #4 // encoding: [0xef,0x59,0xbf,0x4d]
+// CHECK: st2 {v31.s, v0.s}[3], [sp], #8 // encoding: [0xff,0x93,0xbf,0x4d]
+// CHECK: st2 {v0.d, v1.d}[1], [x0], x0 // encoding: [0x00,0x84,0xa0,0x4d]
+
+ st3 {v0.b, v1.b, v2.b}[9], [x0], #3
+ st3 {v15.h, v16.h, v17.h}[7], [x15], #6
+ st3 {v31.s, v0.s, v1.s}[3], [sp], x3
+ st3 {v0.d, v1.d, v2.d}[1], [x0], x6
+// CHECK: st3 {v0.b, v1.b, v2.b}[9], [x0], #3 // encoding: [0x00,0x24,0x9f,0x4d]
+// CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15], #6 // encoding: [0xef,0x79,0x9f,0x4d]
+// CHECK: st3 {v31.s, v0.s, v1.s}[3], [sp], x3 // encoding: [0xff,0xb3,0x83,0x4d]
+// CHECK: st3 {v0.d, v1.d, v2.d}[1], [x0], x6 // encoding: [0x00,0xa4,0x86,0x4d]
+
+ st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5
+ st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7
+ st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16
+ st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32
+// CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0], x5 // encoding: [0x00,0x24,0xa5,0x4d]
+// CHECK: st4 {v15.h, v16.h, v17.h, v18.h}[7], [x15], x7 // encoding: [0xef,0x79,0xa7,0x4d]
+// CHECK: st4 {v31.s, v0.s, v1.s, v2.s}[3], [sp], #16 // encoding: [0xff,0xb3,0xbf,0x4d]
+// CHECK: st4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #32 // encoding: [0x00,0xa4,0xbf,0x4d]
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
index c1659019a8..9f9e777285 100644
--- a/test/MC/Disassembler/AArch64/neon-instructions.txt
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -2060,6 +2060,90 @@
0xff,0x0b,0x9f,0x4c
#----------------------------------------------------------------------
+# Vector load single N-element structure to all lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0]
+# CHECK: ld1r {v15.8h}, [x15]
+# CHECK: ld2r {v31.4s, v0.4s}, [sp]
+# CHECK: ld2r {v0.2d, v1.2d}, [x0]
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0]
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15]
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp]
+0x00,0xc0,0x40,0x4d
+0xef,0xc5,0x40,0x4d
+0xff,0xcb,0x60,0x4d
+0x00,0xcc,0x60,0x4d
+0x00,0xe0,0x40,0x0d
+0xef,0xe5,0x40,0x0d
+0xff,0xeb,0x60,0x0d
+0xff,0xef,0x60,0x0d
+
+#----------------------------------------------------------------------
+# Vector load/store single N-element structure to/from one lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[9], [x0]
+# CHECK: ld2 {v15.h, v16.h}[7], [x15]
+# CHECK: ld3 {v31.s, v0.s, v1.s}[3], [sp]
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0]
+# CHECK: st1 {v0.d}[1], [x0]
+# CHECK: st2 {v31.s, v0.s}[3], [sp]
+# CHECK: st3 {v15.h, v16.h, v17.h}[7], [x15]
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[9], [x0]
+0x00,0x04,0x40,0x4d
+0xef,0x59,0x60,0x4d
+0xff,0xb3,0x40,0x4d
+0x00,0xa4,0x60,0x4d
+0x00,0x84,0x00,0x4d
+0xff,0x93,0x20,0x4d
+0xef,0x79,0x00,0x4d
+0x00,0x24,0x20,0x4d
+
+#----------------------------------------------------------------------
+# Post-index of vector load single N-element structure to all lane of N
+# consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1r {v0.16b}, [x0], #1
+# CHECK: ld1r {v15.8h}, [x15], #2
+# CHECK: ld2r {v31.4s, v0.4s}, [sp], #8
+# CHECK: ld2r {v0.2d, v1.2d}, [x0], #16
+# CHECK: ld3r {v0.8b, v1.8b, v2.8b}, [x0], #3
+# CHECK: ld3r {v15.4h, v16.4h, v17.4h}, [x15], #6
+# CHECK: ld4r {v31.2s, v0.2s, v1.2s, v2.2s}, [sp], x30
+# CHECK: ld4r {v31.1d, v0.1d, v1.1d, v2.1d}, [sp], x7
+0x00,0xc0,0xdf,0x4d
+0xef,0xc5,0xdf,0x4d
+0xff,0xcb,0xff,0x4d
+0x00,0xcc,0xff,0x4d
+0x00,0xe0,0xdf,0x0d
+0xef,0xe5,0xdf,0x0d
+0xff,0xeb,0xfe,0x0d
+0xff,0xef,0xe7,0x0d
+
+#----------------------------------------------------------------------
+# Post-index of vector load/store single N-element structure to/from
+# one lane of N consecutive registers (N = 1,2,3,4)
+#----------------------------------------------------------------------
+# CHECK: ld1 {v0.b}[0], [x0], #1
+# CHECK: ld2 {v15.h, v16.h}[0], [x15], #4
+# CHECK: ld3 {v31.s, v0.s, v1.s}[0], [sp], x3
+# CHECK: ld4 {v0.d, v1.d, v2.d, v3.d}[1], [x0], #24
+# CHECK: st1 {v0.d}[0], [x0], #8
+# CHECK: st2 {v31.s, v0.s}[0], [sp], #8
+# CHECK: st3 {v15.h, v16.h, v17.h}[0], [x15], #6
+# CHECK: st4 {v0.b, v1.b, v2.b, v3.b}[1], [x0], x5
+0x00,0x04,0xdf,0x4d
+0xef,0x59,0xff,0x4d
+0xff,0xb3,0xc3,0x4d
+0x00,0xa4,0xff,0x4d
+0x00,0x84,0x9f,0x4d
+0xff,0x93,0xbf,0x4d
+0xef,0x79,0x9f,0x4d
+0x00,0x24,0xa5,0x4d
+
+#----------------------------------------------------------------------
# Bitwise extract
#----------------------------------------------------------------------
0x20,0x18,0x02,0x2e