author    Tim Northover <tnorthover@apple.com>    2013-08-01 09:20:35 +0000
committer Tim Northover <tnorthover@apple.com>    2013-08-01 09:20:35 +0000
commit    87773c318fcee853fb34a80a10c4347d523bdafb (patch)
tree      6c8b6620d46529f553a508e9190a264534e0a0dd
parent    691aa094dafe54151b6f70168f066bd87c161e8d (diff)
AArch64: add initial NEON support
Patch by Ana Pazos.

- Completed implementation of instruction formats:
    AdvSIMD three same
    AdvSIMD modified immediate
    AdvSIMD scalar pairwise

- Completed implementation of instruction classes (some of the instructions
  in these classes belong to yet unfinished instruction formats):
    Vector Arithmetic
    Vector Immediate
    Vector Pairwise Arithmetic

- Initial implementation of instruction formats:
    AdvSIMD scalar two-reg misc
    AdvSIMD scalar three same

- Initial implementation of instruction class:
    Scalar Arithmetic

- Initial clang changes to support ARM v8 intrinsics.
  Note: no clang changes for scalar intrinsics function name mangling yet.

- Comprehensive test cases for the added instructions, verifying auto
  codegen, encoding, decoding, diagnostics, and intrinsics.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187567 91177308-0d34-0410-b5e6-96231b3b80d8
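As a concrete reference for the "AdvSIMD three same" format this patch completes: it encodes two source vector registers and a destination in one 32-bit word. The standalone C++ sketch below (an illustration, not part of the patch) packs the fields exactly as the NeonI_3VSame TableGen class in AArch64InstrFormats.td lays them out; the printed value is the encoding of "add v0.8b, v1.8b, v2.8b".

#include <cstdint>
#include <cstdio>

// Bit layout taken from NeonI_3VSame: Inst{31}=0, Inst{30}=q, Inst{29}=u,
// Inst{28-24}=0b01110, Inst{23-22}=size, Inst{21}=1, Inst{20-16}=Rm,
// Inst{15-11}=opcode, Inst{10}=1, Inst{9-5}=Rn, Inst{4-0}=Rd.
uint32_t encode3VSame(unsigned q, unsigned u, unsigned size, unsigned opcode,
                      unsigned rm, unsigned rn, unsigned rd) {
  uint32_t inst = 0;                 // Inst{31} = 0
  inst |= (q & 0x1u) << 30;          // 0 = 64-bit, 1 = 128-bit vectors
  inst |= (u & 0x1u) << 29;
  inst |= 0x0eu << 24;               // 0b01110
  inst |= (size & 0x3u) << 22;
  inst |= 0x1u << 21;
  inst |= (rm & 0x1fu) << 16;
  inst |= (opcode & 0x1fu) << 11;
  inst |= 0x1u << 10;
  inst |= (rn & 0x1fu) << 5;
  inst |= rd & 0x1fu;
  return inst;
}

int main() {
  // "add v0.8b, v1.8b, v2.8b": q=0, u=0, size=0b00, opcode=0b10000 (ADDvvv).
  printf("%08x\n", (unsigned)encode3VSame(0, 0, 0, 0x10, 2, 1, 0));
  return 0;                          // prints 0e228420
}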
-rw-r--r--  include/llvm/IR/Intrinsics.td                            |    1
-rw-r--r--  include/llvm/IR/IntrinsicsAArch64.td                     |   41
-rw-r--r--  lib/Target/AArch64/AArch64CallingConv.td                 |    2
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp               |  522
-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.h                 |   33
-rw-r--r--  lib/Target/AArch64/AArch64InstrFormats.td                |   93
-rw-r--r--  lib/Target/AArch64/AArch64InstrInfo.td                   |   40
-rw-r--r--  lib/Target/AArch64/AArch64InstrNEON.td                   | 1634
-rw-r--r--  lib/Target/AArch64/AArch64MCInstLower.cpp                |    5
-rw-r--r--  lib/Target/AArch64/AArch64RegisterInfo.td                |    2
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.cpp                  |    6
-rw-r--r--  lib/Target/AArch64/AArch64Subtarget.h                    |    3
-rw-r--r--  lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp        |  140
-rw-r--r--  lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp  |   40
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp    |   81
-rw-r--r--  lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h      |    9
-rw-r--r--  lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp  |    2
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.cpp             |   66
-rw-r--r--  lib/Target/AArch64/Utils/AArch64BaseInfo.h               |    6
-rw-r--r--  test/CodeGen/AArch64/complex-copy-noneon.ll              |   21
-rw-r--r--  test/CodeGen/AArch64/inline-asm-constraints.ll           |   22
-rw-r--r--  test/CodeGen/AArch64/neon-aba-abd.ll                     |  226
-rw-r--r--  test/CodeGen/AArch64/neon-add-pairwise.ll                |   92
-rw-r--r--  test/CodeGen/AArch64/neon-add-sub.ll                     |  132
-rw-r--r--  test/CodeGen/AArch64/neon-bitcast.ll                     |  574
-rw-r--r--  test/CodeGen/AArch64/neon-bitwise-instructions.ll        |  594
-rw-r--r--  test/CodeGen/AArch64/neon-compare-instructions.ll        | 1982
-rw-r--r--  test/CodeGen/AArch64/neon-facge-facgt.ll                 |   56
-rw-r--r--  test/CodeGen/AArch64/neon-fma.ll                         |  112
-rw-r--r--  test/CodeGen/AArch64/neon-frsqrt-frecp.ll                |   54
-rw-r--r--  test/CodeGen/AArch64/neon-halving-add-sub.ll             |  207
-rw-r--r--  test/CodeGen/AArch64/neon-max-min-pairwise.ll            |  310
-rw-r--r--  test/CodeGen/AArch64/neon-max-min.ll                     |  310
-rw-r--r--  test/CodeGen/AArch64/neon-mla-mls.ll                     |   88
-rw-r--r--  test/CodeGen/AArch64/neon-mov.ll                         |  205
-rw-r--r--  test/CodeGen/AArch64/neon-mul-div.ll                     |  181
-rw-r--r--  test/CodeGen/AArch64/neon-rounding-halving-add.ll        |  105
-rw-r--r--  test/CodeGen/AArch64/neon-rounding-shift.ll              |  138
-rw-r--r--  test/CodeGen/AArch64/neon-saturating-add-sub.ll          |  274
-rw-r--r--  test/CodeGen/AArch64/neon-saturating-rounding-shift.ll   |  138
-rw-r--r--  test/CodeGen/AArch64/neon-saturating-shift.ll            |  138
-rw-r--r--  test/CodeGen/AArch64/neon-shift.ll                       |  140
-rw-r--r--  test/MC/AArch64/basic-a64-diagnostics.s                  |    8
-rw-r--r--  test/MC/AArch64/basic-a64-instructions.s                 |    2
-rw-r--r--  test/MC/AArch64/neon-aba-abd.s                           |   78
-rw-r--r--  test/MC/AArch64/neon-add-pairwise.s                      |   35
-rw-r--r--  test/MC/AArch64/neon-add-sub-instructions.s              |   82
-rw-r--r--  test/MC/AArch64/neon-bitwise-instructions.s              |   60
-rw-r--r--  test/MC/AArch64/neon-compare-instructions.s              |  405
-rw-r--r--  test/MC/AArch64/neon-diagnostics.s                       | 1207
-rw-r--r--  test/MC/AArch64/neon-facge-facgt.s                       |   41
-rw-r--r--  test/MC/AArch64/neon-frsqrt-frecp.s                      |   27
-rw-r--r--  test/MC/AArch64/neon-halving-add-sub.s                   |   74
-rw-r--r--  test/MC/AArch64/neon-max-min-pairwise.s                  |  110
-rw-r--r--  test/MC/AArch64/neon-max-min.s                           |  110
-rw-r--r--  test/MC/AArch64/neon-mla-mls-instructions.s              |   61
-rw-r--r--  test/MC/AArch64/neon-mov.s                               |  207
-rw-r--r--  test/MC/AArch64/neon-mul-div-instructions.s              |   86
-rw-r--r--  test/MC/AArch64/neon-rounding-halving-add.s              |   39
-rw-r--r--  test/MC/AArch64/neon-rounding-shift.s                    |   57
-rw-r--r--  test/MC/AArch64/neon-saturating-add-sub.s                |  133
-rw-r--r--  test/MC/AArch64/neon-saturating-rounding-shift.s         |   70
-rw-r--r--  test/MC/AArch64/neon-saturating-shift.s                  |   69
-rw-r--r--  test/MC/AArch64/neon-shift.s                             |   57
-rw-r--r--  test/MC/AArch64/noneon-diagnostics.s                     |   28
-rw-r--r--  test/MC/Disassembler/AArch64/neon-instructions.txt       |  673

66 files changed, 12503 insertions(+), 41 deletions(-)
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index e1023826ba..1a849c4c30 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -494,6 +494,7 @@ def int_convertuu : Intrinsic<[llvm_anyint_ty],
include "llvm/IR/IntrinsicsPowerPC.td"
include "llvm/IR/IntrinsicsX86.td"
include "llvm/IR/IntrinsicsARM.td"
+include "llvm/IR/IntrinsicsAArch64.td"
include "llvm/IR/IntrinsicsXCore.td"
include "llvm/IR/IntrinsicsHexagon.td"
include "llvm/IR/IntrinsicsNVVM.td"
diff --git a/include/llvm/IR/IntrinsicsAArch64.td b/include/llvm/IR/IntrinsicsAArch64.td
new file mode 100644
index 0000000000..d7b1947db1
--- /dev/null
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@@ -0,0 +1,41 @@
+//===- IntrinsicsAArch64.td - Defines AArch64 intrinsics ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the AArch64-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON)
+
+let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
+
+// Vector Absolute Compare (Floating Point)
+def int_aarch64_neon_vacgeq : Intrinsic<[llvm_v2i64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+def int_aarch64_neon_vacgtq : Intrinsic<[llvm_v2i64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+
+// Vector maxNum (Floating Point)
+def int_aarch64_neon_vmaxnm : Neon_2Arg_Intrinsic;
+
+// Vector minNum (Floating Point)
+def int_aarch64_neon_vminnm : Neon_2Arg_Intrinsic;
+
+// Vector Pairwise maxNum (Floating Point)
+def int_aarch64_neon_vpmaxnm : Neon_2Arg_Intrinsic;
+
+// Vector Pairwise minNum (Floating Point)
+def int_aarch64_neon_vpminnm : Neon_2Arg_Intrinsic;
+
+// Vector Multiply Extended (Floating Point)
+def int_aarch64_neon_vmulx : Neon_2Arg_Intrinsic;
+}
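The two absolute-compare intrinsics above correspond to the FACGE/FACGT instructions: each v2i64 result lane is set to all ones when the compare of absolute values holds, matching the lowering code's note that SIMD compares set the entire lane's bits to 1. A scalar C++ model of vacgeq, offered as a semantic sketch rather than as the backend's implementation:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Models int_aarch64_neon_vacgeq: (v2f64, v2f64) -> v2i64, per-lane
// |lhs| >= |rhs| producing an all-ones or all-zeros mask.
void vacgeqModel(const double lhs[2], const double rhs[2], uint64_t out[2]) {
  for (int lane = 0; lane < 2; ++lane)
    out[lane] = (std::fabs(lhs[lane]) >= std::fabs(rhs[lane])) ? ~0ULL : 0ULL;
}

int main() {
  double a[2] = {-3.0, 1.0}, b[2] = {2.0, -5.0};
  uint64_t mask[2];
  vacgeqModel(a, b, mask);
  printf("%016llx %016llx\n", (unsigned long long)mask[0],
         (unsigned long long)mask[1]);   // ffffffffffffffff 0000000000000000
  return 0;
}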
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
index b880d8373d..bff7eebe00 100644
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ b/lib/Target/AArch64/AArch64CallingConv.td
@@ -61,7 +61,7 @@ def CC_A64_APCS : CallingConv<[
// Vectors and Floating-point types.
CCIfType<[v2i8], CCBitConvertToType<f16>>,
CCIfType<[v4i8, v2i16], CCBitConvertToType<f32>>,
- CCIfType<[v8i8, v4i16, v2i32, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64], CCBitConvertToType<f64>>,
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
CCBitConvertToType<f128>>,
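The calling-convention change above adds v1i64 to the 64-bit vector types that are bit-converted to f64 so they travel in a floating-point register. A minimal sketch of what that bit-conversion means at the value level (assuming CCBitConvertToType is a pure reinterpretation of bits, as a BITCAST would be):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterprets the 64 bits of a v1i64 argument as an f64; no numeric
// conversion takes place, only the register class used for passing changes.
double bitcastV1i64ToF64(uint64_t v) {
  double d;
  std::memcpy(&d, &v, sizeof d);   // well-defined bit-for-bit copy
  return d;
}

int main() {
  uint64_t bits = 0x4000000000000000ULL;     // IEEE-754 pattern of 2.0
  printf("%f\n", bitcastV1i64ToF64(bits));   // prints 2.000000
  return 0;
}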
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index d0abc0bbd1..44b691bfcc 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -42,6 +42,8 @@ static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
: TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
+ const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
// SIMD compares set the entire lane's bits to 1
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
@@ -53,6 +55,21 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
+ if (Subtarget->hasNEON()) {
+ // And the vectors
+ addRegisterClass(MVT::v8i8, &AArch64::VPR64RegClass);
+ addRegisterClass(MVT::v4i16, &AArch64::VPR64RegClass);
+ addRegisterClass(MVT::v2i32, &AArch64::VPR64RegClass);
+ addRegisterClass(MVT::v1i64, &AArch64::VPR64RegClass);
+ addRegisterClass(MVT::v2f32, &AArch64::VPR64RegClass);
+ addRegisterClass(MVT::v16i8, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v8i16, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v4i32, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v2i64, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v4f32, &AArch64::VPR128RegClass);
+ addRegisterClass(MVT::v2f64, &AArch64::VPR128RegClass);
+ }
+
computeRegisterProperties();
// We combine OR nodes for bitfield and NEON BSL operations.
@@ -251,6 +268,31 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setExceptionPointerRegister(AArch64::X0);
setExceptionSelectorRegister(AArch64::X1);
+
+ if (Subtarget->hasNEON()) {
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+
+ setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
+ }
}
EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
@@ -777,7 +819,22 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
case AArch64ISD::WrapperSmall: return "AArch64ISD::WrapperSmall";
- default: return NULL;
+ case AArch64ISD::NEON_BSL:
+ return "AArch64ISD::NEON_BSL";
+ case AArch64ISD::NEON_MOVIMM:
+ return "AArch64ISD::NEON_MOVIMM";
+ case AArch64ISD::NEON_MVNIMM:
+ return "AArch64ISD::NEON_MVNIMM";
+ case AArch64ISD::NEON_FMOVIMM:
+ return "AArch64ISD::NEON_FMOVIMM";
+ case AArch64ISD::NEON_CMP:
+ return "AArch64ISD::NEON_CMP";
+ case AArch64ISD::NEON_CMPZ:
+ return "AArch64ISD::NEON_CMPZ";
+ case AArch64ISD::NEON_TST:
+ return "AArch64ISD::NEON_TST";
+ default:
+ return NULL;
}
}
@@ -2230,6 +2287,213 @@ AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(A64CC::NE, MVT::i32));
}
+static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
+ SDLoc DL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ EVT VT = Op.getValueType();
+ bool Invert = false;
+ SDValue Op0, Op1;
+ unsigned Opcode;
+
+ if (LHS.getValueType().isInteger()) {
+
+ // Attempt to use Vector Integer Compare Mask Test instruction.
+ // TST = icmp ne (and (op0, op1), zero).
+ if (CC == ISD::SETNE) {
+ if (((LHS.getOpcode() == ISD::AND) &&
+ ISD::isBuildVectorAllZeros(RHS.getNode())) ||
+ ((RHS.getOpcode() == ISD::AND) &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()))) {
+
+ SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
+ SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
+ SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
+ return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
+ }
+ }
+
+ // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
+ // Note: Compare against Zero does not support unsigned predicates.
+ if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(LHS.getNode())) &&
+ !isUnsignedIntSetCC(CC)) {
+
+ // If LHS is the zero value, swap operands and CondCode.
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ CC = getSetCCSwappedOperands(CC);
+ Op0 = RHS;
+ } else
+ Op0 = LHS;
+
+ // Ensure valid CondCode for Compare Mask against Zero instruction:
+ // EQ, GE, GT, LE, LT.
+ if (ISD::SETNE == CC) {
+ Invert = true;
+ CC = ISD::SETEQ;
+ }
+
+ // Use the constant's type to distinguish integer from FP compares with zero.
+ Op1 = DAG.getConstant(0, MVT::i32);
+ Opcode = AArch64ISD::NEON_CMPZ;
+
+ } else {
+ // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
+ // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
+ bool Swap = false;
+ switch (CC) {
+ default:
+ llvm_unreachable("Illegal integer comparison.");
+ case ISD::SETEQ:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ break;
+ case ISD::SETNE:
+ Invert = true;
+ CC = ISD::SETEQ;
+ break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ Swap = true;
+ CC = getSetCCSwappedOperands(CC);
+ }
+
+ if (Swap)
+ std::swap(LHS, RHS);
+
+ Opcode = AArch64ISD::NEON_CMP;
+ Op0 = LHS;
+ Op1 = RHS;
+ }
+
+ // Generate Compare Mask instr or Compare Mask against Zero instr.
+ SDValue NeonCmp =
+ DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+
+ if (Invert)
+ NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+
+ return NeonCmp;
+ }
+
+ // Now handle Floating Point cases.
+ // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
+ if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
+ ISD::isBuildVectorAllZeros(LHS.getNode())) {
+
+ // If LHS is the zero value, swap operands and CondCode.
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ CC = getSetCCSwappedOperands(CC);
+ Op0 = RHS;
+ } else
+ Op0 = LHS;
+
+ // Use the constant's type to distinguish integer from FP compares with zero.
+ Op1 = DAG.getConstantFP(0, MVT::f32);
+ Opcode = AArch64ISD::NEON_CMPZ;
+ } else {
+ // Attempt to use Vector Floating Point Compare Mask instruction.
+ Op0 = LHS;
+ Op1 = RHS;
+ Opcode = AArch64ISD::NEON_CMP;
+ }
+
+ SDValue NeonCmpAlt;
+ // Some register compares have to be implemented with swapped CC and operands,
+ // e.g.: OLT implemented as OGT with swapped operands.
+ bool SwapIfRegArgs = false;
+
+ // Ensure valid CondCode for FP Compare Mask against Zero instruction:
+ // EQ, GE, GT, LE, LT.
+ // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
+ switch (CC) {
+ default:
+ llvm_unreachable("Illegal FP comparison");
+ case ISD::SETUNE:
+ case ISD::SETNE:
+ Invert = true; // Fallthrough
+ case ISD::SETOEQ:
+ case ISD::SETEQ:
+ CC = ISD::SETEQ;
+ break;
+ case ISD::SETOLT:
+ case ISD::SETLT:
+ CC = ISD::SETLT;
+ SwapIfRegArgs = true;
+ break;
+ case ISD::SETOGT:
+ case ISD::SETGT:
+ CC = ISD::SETGT;
+ break;
+ case ISD::SETOLE:
+ case ISD::SETLE:
+ CC = ISD::SETLE;
+ SwapIfRegArgs = true;
+ break;
+ case ISD::SETOGE:
+ case ISD::SETGE:
+ CC = ISD::SETGE;
+ break;
+ case ISD::SETUGE:
+ Invert = true;
+ CC = ISD::SETLT;
+ SwapIfRegArgs = true;
+ break;
+ case ISD::SETULE:
+ Invert = true;
+ CC = ISD::SETGT;
+ break;
+ case ISD::SETUGT:
+ Invert = true;
+ CC = ISD::SETLE;
+ SwapIfRegArgs = true;
+ break;
+ case ISD::SETULT:
+ Invert = true;
+ CC = ISD::SETGE;
+ break;
+ case ISD::SETUEQ:
+ Invert = true; // Fallthrough
+ case ISD::SETONE:
+ // Expand this to (OGT | OLT).
+ NeonCmpAlt =
+ DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
+ CC = ISD::SETLT;
+ SwapIfRegArgs = true;
+ break;
+ case ISD::SETUO:
+ Invert = true; // Fallthrough
+ case ISD::SETO:
+ // Expand this to (OGE | OLT).
+ NeonCmpAlt =
+ DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
+ CC = ISD::SETLT;
+ SwapIfRegArgs = true;
+ break;
+ }
+
+ if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
+ CC = getSetCCSwappedOperands(CC);
+ std::swap(Op0, Op1);
+ }
+
+ // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
+ SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+
+ if (NeonCmpAlt.getNode())
+ NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
+
+ if (Invert)
+ NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+
+ return NeonCmp;
+}
+
// (SETCC lhs, rhs, condcode)
SDValue
AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -2239,6 +2503,9 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
EVT VT = Op.getValueType();
+ if (VT.isVector())
+ return LowerVectorSETCC(Op, DAG);
+
if (LHS.getValueType() == MVT::f128) {
// f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
// for the rest of the function (some i32 or i64 values).
@@ -2395,11 +2662,155 @@ AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SETCC: return LowerSETCC(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
+ case ISD::BUILD_VECTOR:
+ return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
}
return SDValue();
}
+/// Check if the specified splat value corresponds to a valid vector constant
+/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI). If
+/// so, return the encoded 8-bit immediate and the OpCmode instruction fields
+/// values.
+static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
+ unsigned SplatBitSize, SelectionDAG &DAG,
+ bool is128Bits, NeonModImmType type, EVT &VT,
+ unsigned &Imm, unsigned &OpCmode) {
+ switch (SplatBitSize) {
+ default:
+ llvm_unreachable("unexpected size for isNeonModifiedImm");
+ case 8: {
+ if (type != Neon_Mov_Imm)
+ return false;
+ assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+ // Neon movi per byte: Op=0, Cmode=1110.
+ OpCmode = 0xe;
+ Imm = SplatBits;
+ VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+ break;
+ }
+ case 16: {
+ // Neon move inst per halfword
+ VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
+ if ((SplatBits & ~0xff) == 0) {
+ // Value = 0x00nn is 0x00nn LSL 0
+ // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
+ // bic: Op=1, Cmode=1001; orr: Op=0, Cmode=1001
+ // Op=x, Cmode=100y
+ Imm = SplatBits;
+ OpCmode = 0x8;
+ break;
+ }
+ if ((SplatBits & ~0xff00) == 0) {
+ // Value = 0xnn00 is 0x00nn LSL 8
+ // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
+ // bic: Op=1, Cmode=1011; orr: Op=0, Cmode=1011
+ // Op=x, Cmode=101x
+ Imm = SplatBits >> 8;
+ OpCmode = 0xa;
+ break;
+ }
+ // can't handle any other
+ return false;
+ }
+
+ case 32: {
+ // First the LSL variants (MSL is unusable by some interested instructions).
+
+ // Neon move instr per word, shift zeros
+ VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
+ if ((SplatBits & ~0xff) == 0) {
+ // Value = 0x000000nn is 0x000000nn LSL 0
+ // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
+ // bic: Op=1, Cmode= 0001; orr: Op=0, Cmode= 0001
+ // Op=x, Cmode=000x
+ Imm = SplatBits;
+ OpCmode = 0;
+ break;
+ }
+ if ((SplatBits & ~0xff00) == 0) {
+ // Value = 0x0000nn00 is 0x000000nn LSL 8
+ // movi: Op=0, Cmode= 0010; mvni: Op=1, Cmode= 0010
+ // bic: Op=1, Cmode= 0011; orr : Op=0, Cmode= 0011
+ // Op=x, Cmode=001x
+ Imm = SplatBits >> 8;
+ OpCmode = 0x2;
+ break;
+ }
+ if ((SplatBits & ~0xff0000) == 0) {
+ // Value = 0x00nn0000 is 0x000000nn LSL 16
+ // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
+ // bic: Op=1, Cmode= 0101; orr: Op=0, Cmode= 0101
+ // Op=x, Cmode=010x
+ Imm = SplatBits >> 16;
+ OpCmode = 0x4;
+ break;
+ }
+ if ((SplatBits & ~0xff000000) == 0) {
+ // Value = 0xnn000000 is 0x000000nn LSL 24
+ // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
+ // bic: Op=1, Cmode= 0111; orr: Op=0, Cmode= 0111
+ // Op=x, Cmode=011x
+ Imm = SplatBits >> 24;
+ OpCmode = 0x6;
+ break;
+ }
+
+ // Now the MSL immediates.
+
+ // Neon move instr per word, shift ones
+ if ((SplatBits & ~0xffff) == 0 &&
+ ((SplatBits | SplatUndef) & 0xff) == 0xff) {
+ // Value = 0x0000nnff is 0x000000nn MSL 8
+ // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
+ // Op=x, Cmode=1100
+ Imm = SplatBits >> 8;
+ OpCmode = 0xc;
+ break;
+ }
+ if ((SplatBits & ~0xffffff) == 0 &&
+ ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
+ // Value = 0x00nnffff is 0x000000nn MSL 16
+ // movi: Op=0, Cmode= 1101; mvni: Op=1, Cmode= 1101
+ // Op=x, Cmode=1101
+ Imm = SplatBits >> 16;
+ OpCmode = 0xd;
+ break;
+ }
+ // can't handle any other
+ return false;
+ }
+
+ case 64: {
+ if (type != Neon_Mov_Imm)
+ return false;
+ // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
+ // movi Op=1, Cmode=1110.
+ OpCmode = 0x1e;
+ uint64_t BitMask = 0xff;
+ uint64_t Val = 0;
+ unsigned ImmMask = 1;
+ Imm = 0;
+ for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+ if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
+ Val |= BitMask;
+ Imm |= ImmMask;
+ } else if ((SplatBits & BitMask) != 0) {
+ return false;
+ }
+ BitMask <<= 8;
+ ImmMask <<= 1;
+ }
+ SplatBits = Val;
+ VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
+ break;
+ }
+ }
+
+ return true;
+}
+
static SDValue PerformANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
@@ -2725,6 +3136,7 @@ static SDValue PerformORCombine(SDNode *N,
const AArch64Subtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
EVT VT = N->getValueType(0);
if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
@@ -2745,6 +3157,44 @@ static SDValue PerformORCombine(SDNode *N,
if (Res.getNode())
return Res;
+ if (!Subtarget->hasNEON())
+ return SDValue();
+
+ // Attempt to use vector immediate-form BSL
+ // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+
+ SDValue N0 = N->getOperand(0);
+ if (N0.getOpcode() != ISD::AND)
+ return SDValue();
+
+ SDValue N1 = N->getOperand(1);
+ if (N1.getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ APInt SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+ APInt SplatBits0;
+ if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+ HasAnyUndefs) &&
+ !HasAnyUndefs) {
+ BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+ APInt SplatBits1;
+ if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+ HasAnyUndefs) &&
+ !HasAnyUndefs && SplatBits0 == ~SplatBits1) {
+ // Canonicalize the vector type to make instruction selection simpler.
+ EVT CanonicalVT = VT.is128BitVector() ? MVT::v16i8 : MVT::v8i8;
+ SDValue Result = DAG.getNode(AArch64ISD::NEON_BSL, DL, CanonicalVT,
+ N0->getOperand(1), N0->getOperand(0),
+ N1->getOperand(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, Result);
+ }
+ }
+ }
+
return SDValue();
}
@@ -2819,6 +3269,76 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+SDValue
+AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const AArch64Subtarget *ST) const {
+
+ BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+
+ APInt SplatBits, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+
+ // Note we favor lowering MOVI over MVNI.
+ // This has implications on the definition of patterns in TableGen to select
+ // BIC immediate instructions but not ORR immediate instructions.
+ // If this lowering order is changed, TableGen patterns for BIC immediate and
+ // ORR immediate instructions have to be updated.
+ if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+ if (SplatBitSize <= 64) {
+ // First attempt to use vector immediate-form MOVI
+ EVT NeonMovVT;
+ unsigned Imm = 0;
+ unsigned OpCmode = 0;
+
+ if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG, VT.is128BitVector(),
+ Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
+ SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
+ SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
+
+ if (ImmVal.getNode() && OpCmodeVal.getNode()) {
+ SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
+ ImmVal, OpCmodeVal);
+ return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
+ }
+ }
+
+ // Then attempt to use vector immediate-form MVNI
+ uint64_t NegatedImm = (~SplatBits).getZExtValue();
+ if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
+ DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
+ Imm, OpCmode)) {
+ SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
+ SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
+ if (ImmVal.getNode() && OpCmodeVal.getNode()) {
+ SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
+ ImmVal, OpCmodeVal);
+ return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
+ }
+ }
+
+ // Attempt to use vector immediate-form FMOV
+ if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
+ (VT == MVT::v2f64 && SplatBitSize == 64)) {
+ APFloat RealVal(
+ SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
+ SplatBits);
+ uint32_t ImmVal;
+ if (A64Imms::isFPImm(RealVal, ImmVal)) {
+ SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
+ return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
+ }
+ }
+ }
+ }
+ return SDValue();
+}
+
AArch64TargetLowering::ConstraintType
AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
if (Constraint.size() == 1) {
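To make the isNeonModifiedImm logic above concrete, here is a standalone sketch of just its 16-bit case: a halfword splat is encodable as a modified immediate only in the forms 0x00nn (LSL 0, OpCmode 0x8) or 0xnn00 (LSL 8, OpCmode 0xa). It is distilled from the function above for illustration, not a drop-in replacement for it:

#include <cstdint>
#include <cstdio>

// Returns true and fills imm/opCmode when a 16-bit splat value fits the
// MOVI/MVNI "modified immediate" forms handled by the SplatBitSize == 16
// case of isNeonModifiedImm.
bool neonModImm16(uint16_t splat, unsigned &imm, unsigned &opCmode) {
  if ((splat & ~0xffu) == 0) {       // 0x00nn is 0x00nn LSL 0
    imm = splat;
    opCmode = 0x8;                   // Op=x, Cmode=100y
    return true;
  }
  if ((splat & ~0xff00u) == 0) {     // 0xnn00 is 0x00nn LSL 8
    imm = splat >> 8;
    opCmode = 0xa;                   // Op=x, Cmode=101x
    return true;
  }
  return false;                      // e.g. 0x1234: not representable
}

int main() {
  const uint16_t tests[] = {0x007f, 0x4500, 0x1234};
  for (uint16_t v : tests) {
    unsigned imm = 0, opCmode = 0;
    bool ok = neonModImm16(v, imm, opCmode);
    printf("0x%04x -> %s imm=0x%02x opcmode=0x%x\n", v, ok ? "yes" : "no ",
           imm, opCmode);
  }
  return 0;
}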
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 320346e60b..67a908e24e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -111,7 +111,28 @@ namespace AArch64ISD {
// created using the small memory model style: i.e. adrp/add or
// adrp/mem-op. This exists to prevent bare TargetAddresses which may never
// get selected.
- WrapperSmall
+ WrapperSmall,
+
+ // Vector bitwise select
+ NEON_BSL,
+
+ // Vector move immediate
+ NEON_MOVIMM,
+
+ // Vector Move Inverted Immediate
+ NEON_MVNIMM,
+
+ // Vector FP move immediate
+ NEON_FMOVIMM,
+
+ // Vector compare
+ NEON_CMP,
+
+ // Vector compare zero
+ NEON_CMPZ,
+
+ // Vector compare bitwise test
+ NEON_TST
};
}
@@ -148,9 +169,11 @@ public:
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
- SDLoc DL, SDValue &Chain) const;
+ SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
+ const AArch64Subtarget *ST) const;
+ void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+ SDValue &Chain) const;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
@@ -253,6 +276,10 @@ private:
return &getTargetMachine().getSubtarget<AArch64Subtarget>();
}
};
+enum NeonModImmType {
+ Neon_Mov_Imm,
+ Neon_Mvn_Imm
+};
} // namespace llvm
#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
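The NEON_CMP and NEON_CMPZ nodes declared above are only created for the predicates the compare-mask instructions provide (EQ, GE, GT, plus the LE/LT forms against zero), so LowerVectorSETCC rewrites everything else, usually as the bitwise NOT of a complementary ordered compare; that is correct because ordered compares yield false on NaN input. A small sketch of a few of those rewrites (the fcmeq/fcmge/fcmgt mnemonics are the instructions ultimately selected; the helper itself is illustrative):

#include <cstdio>
#include <cstring>

struct Lowered {
  const char *baseOp;  // compare-mask instruction actually emitted
  bool invert;         // whether the resulting mask is bitwise-NOTed
};

// Mirrors a few register-register cases of the FP switch in
// LowerVectorSETCC (predicates needing swapped operands or an extra OR,
// such as OLT or ONE, are omitted here).
Lowered lowerFPPredicate(const char *cc) {
  if (!strcmp(cc, "oeq")) return {"fcmeq", false};
  if (!strcmp(cc, "oge")) return {"fcmge", false};
  if (!strcmp(cc, "ogt")) return {"fcmgt", false};
  if (!strcmp(cc, "une")) return {"fcmeq", true};  // NOT(OEQ)
  if (!strcmp(cc, "ule")) return {"fcmgt", true};  // NOT(OGT)
  if (!strcmp(cc, "ult")) return {"fcmge", true};  // NOT(OGE)
  return {"?", false};
}

int main() {
  const char *preds[] = {"oeq", "une", "ule", "ult"};
  for (const char *cc : preds) {
    Lowered l = lowerFPPredicate(cc);
    printf("%-3s -> %s%s\n", cc, l.invert ? "not " : "", l.baseOp);
  }
  return 0;
}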
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 9dd122f149..09451fdc45 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -959,3 +959,96 @@ class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4,
let Inst{4-0} = op4;
}
+
+//===----------------------------------------------------------------------===//
+//
+// Neon Instruction Format Definitions.
+//
+
+let Predicates = [HasNEON] in {
+
+class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
+ : InstAlias<Asm, Result, Emit> {
+}
+
+// Format AdvSIMD 3 vector registers with same vector type
+class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ // Inherit Rm in 20-16
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0b1;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+// Format AdvSIMD 1 vector register with modified immediate
+class NeonI_1VModImm<bit q, bit op,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRd<outs,ins, asmstr, patterns, itin>
+{
+ bits<8> Imm;
+ bits<4> cmode;
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = op;
+ let Inst{28-19} = 0b0111100000;
+ let Inst{15-12} = cmode;
+ let Inst{11} = 0b0; // o2
+ let Inst{10} = 1;
+ // Inherit Rd in 4-0
+ let Inst{18-16} = Imm{7-5}; // imm a:b:c
+ let Inst{9-5} = Imm{4-0}; // imm d:e:f:g:h
+}
+
+// Format AdvSIMD 3 scalar registers with same type
+
+class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdnm<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = 0b1;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b11110;
+ let Inst{23-22} = size;
+ let Inst{21} = 0b1;
+ // Inherit Rm in 20-16
+ let Inst{15-11} = opcode;
+ let Inst{10} = 0b1;
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+
+// Format AdvSIMD 2 vector registers miscellaneous
+class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
+ dag outs, dag ins, string asmstr,
+ list<dag> patterns, InstrItinClass itin>
+ : A64InstRdn<outs, ins, asmstr, patterns, itin>
+{
+ let Inst{31} = 0b0;
+ let Inst{30} = q;
+ let Inst{29} = u;
+ let Inst{28-24} = 0b01110;
+ let Inst{23-22} = size;
+ let Inst{21-17} = 0b10000;
+ let Inst{16-12} = opcode;
+ let Inst{11-10} = 0b10;
+
+ // Inherit Rn in 9-5
+ // Inherit Rd in 4-0
+}
+
+}
+
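One easy-to-miss detail of the NeonI_1VModImm format above: the 8-bit immediate is not contiguous in the instruction word; its top three bits ("a:b:c") land in Inst{18-16} and the low five ("d:e:f:g:h") in Inst{9-5}, on either side of the cmode field. A standalone sketch (not part of the patch) reproducing that packing:

#include <cstdint>
#include <cstdio>

// Bit layout from NeonI_1VModImm: Inst{30}=q, Inst{29}=op,
// Inst{28-19}=0b0111100000, Inst{18-16}=Imm{7-5}, Inst{15-12}=cmode,
// Inst{11}=0 (o2), Inst{10}=1, Inst{9-5}=Imm{4-0}, Inst{4-0}=Rd.
uint32_t encode1VModImm(unsigned q, unsigned op, unsigned cmode,
                        unsigned imm8, unsigned rd) {
  uint32_t inst = 0;
  inst |= (q & 0x1u) << 30;
  inst |= (op & 0x1u) << 29;
  inst |= 0x1e0u << 19;                // 0b0111100000
  inst |= ((imm8 >> 5) & 0x7u) << 16;  // imm a:b:c
  inst |= (cmode & 0xfu) << 12;
  inst |= 0x1u << 10;                  // Inst{11} stays 0 (o2)
  inst |= (imm8 & 0x1fu) << 5;         // imm d:e:f:g:h
  inst |= rd & 0x1fu;
  return inst;
}

int main() {
  // "movi v0.4s, #0x1": q=1, op=0, cmode=0b0000, imm8=1 (LSL 0 form).
  printf("%08x\n", (unsigned)encode1VModImm(1, 0, 0x0, 0x01, 0));
  return 0;                            // prints 4f000420
}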
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 725a12164b..07289b0be1 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -11,6 +11,17 @@
//
//===----------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// AArch64 Instruction Predicate Definitions.
+//
+def HasNEON : Predicate<"Subtarget->hasNEON()">,
+ AssemblerPredicate<"FeatureNEON", "neon">;
+def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
+ AssemblerPredicate<"FeatureCrypto", "crypto">;
+
+// Use fused MAC if more precision in FP computation is allowed.
+def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
+ " FPOpFusion::Fast)">;
include "AArch64InstrFormats.td"
//===----------------------------------------------------------------------===//
@@ -2173,6 +2184,29 @@ def FMSUBdddd : A64I_fpdp3Impl<"fmsub", FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
+// Extra patterns for when we're allowed to optimise separate multiplication and
+// addition.
+let Predicates = [UseFusedMAC] in {
+def : Pat<(fadd FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+ (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(fsub FPR32:$Ra, (fmul FPR32:$Rn, FPR32:$Rm)),
+ (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(fsub (fmul FPR32:$Rn, FPR32:$Rm), FPR32:$Ra),
+ (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+def : Pat<(fsub (fneg FPR32:$Ra), (fmul FPR32:$Rn, FPR32:$Rm)),
+ (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
+
+def : Pat<(fadd FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+ (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(fsub FPR64:$Ra, (fmul FPR64:$Rn, FPR64:$Rm)),
+ (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(fsub (fmul FPR64:$Rn, FPR64:$Rm), FPR64:$Ra),
+ (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+def : Pat<(fsub (fneg FPR64:$Ra), (fmul FPR64:$Rn, FPR64:$Rm)),
+ (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
+}
+
+
//===----------------------------------------------------------------------===//
// Floating-point <-> fixed-point conversion instructions
//===----------------------------------------------------------------------===//
@@ -5123,3 +5157,9 @@ defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm),
defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
(i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>;
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "AArch64InstrNEON.td"
\ No newline at end of file
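For context on the UseFusedMAC predicate defined in this file: the fmul+fadd patterns are gated on FPOpFusion::Fast because fusing is not value-preserving; FMADD rounds once where the separate multiply and add round twice. A self-contained demonstration of the difference (assuming IEEE-754 doubles and a C++11 toolchain with std::fma):

#include <cmath>
#include <cstdio>

int main() {
  double eps = std::ldexp(1.0, -29);      // 2^-29
  double a = 1.0 + eps, b = 1.0 + eps, c = -1.0;
  double separate = a * b + c;            // product rounded, then sum rounded
  double fused = std::fma(a, b, c);       // one rounding, keeps the 2^-58 term
  printf("separate = %a\n", separate);    // 0x1p-28
  printf("fused    = %a\n", fused);       // 0x1.0000004p-28
  return 0;
}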
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
new file mode 100644
index 0000000000..98b9e3e115
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -0,0 +1,1634 @@
+//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the AArch64 NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+def Neon_bsl : SDNode<"AArch64ISD::NEON_BSL", SDTypeProfile<1, 3,
+ [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+ SDTCisSameAs<0, 3>]>>;
+
+// (outs Result), (ins Imm, OpCmode)
+def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
+
+def Neon_movi : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>;
+
+def Neon_mvni : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>;
+
+// (outs Result), (ins Imm)
+def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1,
+ [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
+
+// (outs Result), (ins LHS, RHS, CondCode)
+def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3,
+ [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
+
+// (outs Result), (ins LHS, 0/0.0 constant, CondCode)
+def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
+ [SDTCisVec<0>, SDTCisVec<1>]>>;
+
+// (outs Result), (ins LHS, RHS)
+def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
+ [SDTCisVec<0>, SDTCisSameAs<1, 2>]>>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode8B,
+ SDPatternOperator opnode16B,
+ bit Commutable = 0>
+{
+ let isCommutable = Commutable in {
+ def _8B : NeonI_3VSame<0b0, u, size, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
+ [(set (v8i8 VPR64:$Rd),
+ (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
+ NoItinerary>;
+
+ def _16B : NeonI_3VSame<0b1, u, size, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
+ NoItinerary>;
+ }
+
+}
+
+multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
+ string asmop, SDPatternOperator opnode,
+ bit Commutable = 0>
+{
+ let isCommutable = Commutable in {
+ def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h",
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))],
+ NoItinerary>;
+
+ def _8H : NeonI_3VSame<0b1, u, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h",
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))],
+ NoItinerary>;
+
+ def _2S : NeonI_3VSame<0b0, u, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))],
+ NoItinerary>;
+
+ def _4S : NeonI_3VSame<0b1, u, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))],
+ NoItinerary>;
+ }
+}
+multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
+ string asmop, SDPatternOperator opnode,
+ bit Commutable = 0>
+ : NeonI_3VSame_HS_sizes<u, opcode, asmop, opnode, Commutable>
+{
+ let isCommutable = Commutable in {
+ def _8B : NeonI_3VSame<0b0, u, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
+ [(set (v8i8 VPR64:$Rd),
+ (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
+ NoItinerary>;
+
+ def _16B : NeonI_3VSame<0b1, u, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
+ NoItinerary>;
+ }
+}
+
+multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
+ string asmop, SDPatternOperator opnode,
+ bit Commutable = 0>
+ : NeonI_3VSame_BHS_sizes<u, opcode, asmop, opnode, Commutable>
+{
+ let isCommutable = Commutable in {
+ def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))],
+ NoItinerary>;
+ }
+}
+
+// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types,
+// but Result types can be integer or floating point types.
+multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
+ string asmop, SDPatternOperator opnode2S,
+ SDPatternOperator opnode4S,
+ SDPatternOperator opnode2D,
+ ValueType ResTy2S, ValueType ResTy4S,
+ ValueType ResTy2D, bit Commutable = 0>
+{
+ let isCommutable = Commutable in {
+ def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
+ asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
+ [(set (ResTy2S VPR64:$Rd),
+ (ResTy2S (opnode2S (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
+ NoItinerary>;
+
+ def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
+ [(set (ResTy4S VPR128:$Rd),
+ (ResTy4S (opnode4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
+ NoItinerary>;
+
+ def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
+ asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
+ [(set (ResTy2D VPR128:$Rd),
+ (ResTy2D (opnode2D (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
+ NoItinerary>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions
+//===----------------------------------------------------------------------===//
+
+// Vector Arithmetic Instructions
+
+// Vector Add (Integer and Floating-Point)
+
+defm ADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>;
+defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd, fadd, fadd,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Sub (Integer and Floating-Point)
+
+defm SUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>;
+defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub, fsub, fsub,
+ v2f32, v4f32, v2f64, 0>;
+
+// Vector Multiply (Integer and Floating-Point)
+
+defm MULvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>;
+defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul, fmul, fmul,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Multiply (Polynomial)
+
+defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
+ int_arm_neon_vmulp, int_arm_neon_vmulp, 1>;
+
+// Vector Multiply-accumulate and Multiply-subtract (Integer)
+
+// Class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no fixed data type
+// and a constraint tying two operands together ($src = $Rd).
+class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
+ RegisterClass VPRC, ValueType OpTy, bit q, bit u, bits<2> size, bits<5> opcode,
+ SDPatternOperator opnode>
+ : NeonI_3VSame<q, u, size, opcode,
+ (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
+ asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
+ [(set (OpTy VPRC:$Rd),
+ (OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))],
+ NoItinerary> {
+ let Constraints = "$src = $Rd";
+}
+
+def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (add node:$Ra, (mul node:$Rn, node:$Rm))>;
+
+def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (sub node:$Ra, (mul node:$Rn, node:$Rm))>;
+
+
+def MLAvvv_8B: NeonI_3VSame_Constraint_impl<"mla", ".8b", VPR64, v8i8,
+ 0b0, 0b0, 0b00, 0b10010, Neon_mla>;
+def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8,
+ 0b1, 0b0, 0b00, 0b10010, Neon_mla>;
+def MLAvvv_4H: NeonI_3VSame_Constraint_impl<"mla", ".4h", VPR64, v4i16,
+ 0b0, 0b0, 0b01, 0b10010, Neon_mla>;
+def MLAvvv_8H: NeonI_3VSame_Constraint_impl<"mla", ".8h", VPR128, v8i16,
+ 0b1, 0b0, 0b01, 0b10010, Neon_mla>;
+def MLAvvv_2S: NeonI_3VSame_Constraint_impl<"mla", ".2s", VPR64, v2i32,
+ 0b0, 0b0, 0b10, 0b10010, Neon_mla>;
+def MLAvvv_4S: NeonI_3VSame_Constraint_impl<"mla", ".4s", VPR128, v4i32,
+ 0b1, 0b0, 0b10, 0b10010, Neon_mla>;
+
+def MLSvvv_8B: NeonI_3VSame_Constraint_impl<"mls", ".8b", VPR64, v8i8,
+ 0b0, 0b1, 0b00, 0b10010, Neon_mls>;
+def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8,
+ 0b1, 0b1, 0b00, 0b10010, Neon_mls>;
+def MLSvvv_4H: NeonI_3VSame_Constraint_impl<"mls", ".4h", VPR64, v4i16,
+ 0b0, 0b1, 0b01, 0b10010, Neon_mls>;
+def MLSvvv_8H: NeonI_3VSame_Constraint_impl<"mls", ".8h", VPR128, v8i16,
+ 0b1, 0b1, 0b01, 0b10010, Neon_mls>;
+def MLSvvv_2S: NeonI_3VSame_Constraint_impl<"mls", ".2s", VPR64, v2i32,
+ 0b0, 0b1, 0b10, 0b10010, Neon_mls>;
+def MLSvvv_4S: NeonI_3VSame_Constraint_impl<"mls", ".4s", VPR128, v4i32,
+ 0b1, 0b1, 0b10, 0b10010, Neon_mls>;
+
+// Vector Multiply-accumulate and Multiply-subtract (Floating Point)
+
+def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (fadd node:$Ra, (fmul node:$Rn, node:$Rm))>;
+
+def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (fsub node:$Ra, (fmul node:$Rn, node:$Rm))>;
+
+let Predicates = [HasNEON, UseFusedMAC] in {
+def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s", VPR64, v2f32,
+ 0b0, 0b0, 0b00, 0b11001, Neon_fmla>;
+def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s", VPR128, v4f32,
+ 0b1, 0b0, 0b00, 0b11001, Neon_fmla>;
+def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d", VPR128, v2f64,
+ 0b1, 0b0, 0b01, 0b11001, Neon_fmla>;
+
+def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s", VPR64, v2f32,
+ 0b0, 0b0, 0b10, 0b11001, Neon_fmls>;
+def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s", VPR128, v4f32,
+ 0b1, 0b0, 0b10, 0b11001, Neon_fmls>;
+def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d", VPR128, v2f64,
+ 0b1, 0b0, 0b11, 0b11001, Neon_fmls>;
+}
+
+// We're also allowed to match the fma instruction regardless of compile
+// options.
+def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)),
+ (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
+def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
+ (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
+ (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+
+def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)),
+ (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
+def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
+ (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
+ (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
+
+// Vector Divide (Floating-Point)
+
+defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv, fdiv, fdiv,
+ v2f32, v4f32, v2f64, 0>;
+
+// Vector Bitwise Operations
+
+// Vector Bitwise AND
+
+defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>;
+
+// Vector Bitwise Exclusive OR
+
+defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>;
+
+// Vector Bitwise OR
+
+defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
+
+// ORR disassembled as MOV if Vn==Vm
+
+// Vector Move - register
+// Alias for ORR when Vn == Vm; MOV is the preferred syntax.
+def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
+ (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn)>;
+def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
+ (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn)>;
+
+def Neon_immAllOnes: PatLeaf<(Neon_movi (i32 timm), (i32 imm)), [{
+ ConstantSDNode *ImmConstVal = cast<ConstantSDNode>(N->getOperand(0));
+ ConstantSDNode *OpCmodeConstVal = cast<ConstantSDNode>(N->getOperand(1));
+ unsigned EltBits;
+ uint64_t EltVal = A64Imms::decodeNeonModImm(ImmConstVal->getZExtValue(),
+ OpCmodeConstVal->getZExtValue(), EltBits);
+ return (EltBits == 8 && EltVal == 0xff);
+}]>;
+
+
+def Neon_not8B : PatFrag<(ops node:$in),
+ (xor node:$in, (bitconvert (v8i8 Neon_immAllOnes)))>;
+def Neon_not16B : PatFrag<(ops node:$in),
+ (xor node:$in, (bitconvert (v16i8 Neon_immAllOnes)))>;
+
+def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
+ (or node:$Rn, (Neon_not8B node:$Rm))>;
+
+def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm),
+ (or node:$Rn, (Neon_not16B node:$Rm))>;
+
+def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm),
+ (and node:$Rn, (Neon_not8B node:$Rm))>;
+
+def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm),
+ (and node:$Rn, (Neon_not16B node:$Rm))>;
+
+
+// Vector Bitwise OR NOT - register
+
+defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn",
+ Neon_orn8B, Neon_orn16B, 0>;
+
+// Vector Bitwise Bit Clear (AND NOT) - register
+
+defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic",
+ Neon_bic8B, Neon_bic16B, 0>;
+
+multiclass Neon_bitwise2V_patterns<SDPatternOperator opnode8B,
+ SDPatternOperator opnode16B,
+ Instruction INST8B,
+ Instruction INST16B> {
+ def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$Rn, VPR128:$Rm)>;
+}
+
+// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN
+defm : Neon_bitwise2V_patterns<and, and, ANDvvv_8B, ANDvvv_16B>;
+defm : Neon_bitwise2V_patterns<or, or, ORRvvv_8B, ORRvvv_16B>;
+defm : Neon_bitwise2V_patterns<xor, xor, EORvvv_8B, EORvvv_16B>;
+defm : Neon_bitwise2V_patterns<Neon_bic8B, Neon_bic16B, BICvvv_8B, BICvvv_16B>;
+defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>;
+
+// Vector Bitwise Select
+def BSLvvv_8B : NeonI_3VSame_Constraint_impl<"bsl", ".8b", VPR64, v8i8,
+ 0b0, 0b1, 0b01, 0b00011, Neon_bsl>;
+
+def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8,
+ 0b1, 0b1, 0b01, 0b00011, Neon_bsl>;
+
+multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
+ Instruction INST8B,
+ Instruction INST16B> {
+ // Disassociate type from instruction definition
+ def : Pat<(v2i32 (opnode VPR64:$src,VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i16 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1i64 (opnode VPR64:$src, VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i32 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v8i16 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2i64 (opnode VPR128:$src, VPR128:$Rn, VPR128:$Rm)),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+
+ // Allow matching the BSL instruction pattern with a non-constant operand.
+ def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd),
+ (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+ (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd),
+ (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+ (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd),
+ (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+ (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd),
+ (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
+ (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd),
+ (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+ (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd),
+ (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+ (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd),
+ (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+ (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd),
+ (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
+ (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
+
+ // Allow matching the llvm.arm.* intrinsics.
+ def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src),
+ (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src),
+ (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src),
+ (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src),
+ (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
+ (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
+ (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src),
+ (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src),
+ (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src),
+ (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src),
+ (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+ def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src),
+ (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+ (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
+}
+
+// Additional patterns for bitwise instruction BSL
+defm : Neon_bitwise3V_patterns<Neon_bsl, BSLvvv_8B, BSLvvv_16B>;
+
+def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm),
+ (Neon_bsl node:$src, node:$Rn, node:$Rm),
+ [{ (void)N; return false; }]>;
+
+// Vector Bitwise Insert if True
+
+def BITvvv_8B : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64, v8i8,
+ 0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
+def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8,
+ 0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
+
+// Vector Bitwise Insert if False
+
+def BIFvvv_8B : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64, v8i8,
+ 0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
+def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8,
+ 0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
+
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+
+def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>;
+def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
+ (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>;
+
+// Vector Absolute Difference and Accumulate (Unsigned)
+def UABAvvv_8B : NeonI_3VSame_Constraint_impl<"uaba", ".8b", VPR64, v8i8,
+ 0b0, 0b1, 0b00, 0b01111, Neon_uaba>;
+def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8,
+ 0b1, 0b1, 0b00, 0b01111, Neon_uaba>;
+def UABAvvv_4H : NeonI_3VSame_Constraint_impl<"uaba", ".4h", VPR64, v4i16,
+ 0b0, 0b1, 0b01, 0b01111, Neon_uaba>;
+def UABAvvv_8H : NeonI_3VSame_Constraint_impl<"uaba", ".8h", VPR128, v8i16,
+ 0b1, 0b1, 0b01, 0b01111, Neon_uaba>;
+def UABAvvv_2S : NeonI_3VSame_Constraint_impl<"uaba", ".2s", VPR64, v2i32,
+ 0b0, 0b1, 0b10, 0b01111, Neon_uaba>;
+def UABAvvv_4S : NeonI_3VSame_Constraint_impl<"uaba", ".4s", VPR128, v4i32,
+ 0b1, 0b1, 0b10, 0b01111, Neon_uaba>;
+
+// Vector Absolute Difference and Accumulate (Signed)
+def SABAvvv_8B : NeonI_3VSame_Constraint_impl<"saba", ".8b", VPR64, v8i8,
+ 0b0, 0b0, 0b00, 0b01111, Neon_saba>;
+def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8,
+ 0b1, 0b0, 0b00, 0b01111, Neon_saba>;
+def SABAvvv_4H : NeonI_3VSame_Constraint_impl<"saba", ".4h", VPR64, v4i16,
+ 0b0, 0b0, 0b01, 0b01111, Neon_saba>;
+def SABAvvv_8H : NeonI_3VSame_Constraint_impl<"saba", ".8h", VPR128, v8i16,
+ 0b1, 0b0, 0b01, 0b01111, Neon_saba>;
+def SABAvvv_2S : NeonI_3VSame_Constraint_impl<"saba", ".2s", VPR64, v2i32,
+ 0b0, 0b0, 0b10, 0b01111, Neon_saba>;
+def SABAvvv_4S : NeonI_3VSame_Constraint_impl<"saba", ".4s", VPR128, v4i32,
+ 0b1, 0b0, 0b10, 0b01111, Neon_saba>;
+
+
+// Vector Absolute Difference (Signed, Unsigned)
+defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>;
+defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>;
+
+// Vector Absolute Difference (Floating Point)
+defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd",
+ int_arm_neon_vabds, int_arm_neon_vabds,
+ int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>;
+
+// Vector Reciprocal Step (Floating Point)
+defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps",
+ int_arm_neon_vrecps, int_arm_neon_vrecps,
+ int_arm_neon_vrecps,
+ v2f32, v4f32, v2f64, 0>;
+
+// Vector Reciprocal Square Root Step (Floating Point)
+defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts",
+ int_arm_neon_vrsqrts,
+ int_arm_neon_vrsqrts,
+ int_arm_neon_vrsqrts,
+ v2f32, v4f32, v2f64, 0>;
+
+// Vector Comparisons
+
+def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs),
+ (Neon_cmp node:$lhs, node:$rhs, SETEQ)>;
+def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs),
+ (Neon_cmp node:$lhs, node:$rhs, SETUGE)>;
+def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs),
+ (Neon_cmp node:$lhs, node:$rhs, SETGE)>;
+def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs),
+ (Neon_cmp node:$lhs, node:$rhs, SETUGT)>;
+def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
+ (Neon_cmp node:$lhs, node:$rhs, SETGT)>;
+
+// NeonI_compare_aliases class: swaps register operands to implement
+// comparison aliases, e.g., CMLE is an alias for CMGE with operands reversed.
+class NeonI_compare_aliases<string asmop, string asmlane,
+ Instruction inst, RegisterClass VPRC>
+ : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
+ ", $Rm" # asmlane,
+ (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
+
+// Vector Comparisons (Integer)
+
+// Vector Compare Mask Equal (Integer)
+let isCommutable = 1 in {
+defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>;
+}
+
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>;
+
+// Vector Compare Mask Greater Than or Equal (Integer)
+defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>;
+
+// Vector Compare Mask Higher (Unsigned Integer)
+defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>;
+
+// Vector Compare Mask Greater Than (Integer)
+defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>;
+
+// Vector Compare Mask Bitwise Test (Integer)
+defm CMTSTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>;
+
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is an alias for CMHS with the operands reversed.
+def CMLSvvv_8B : NeonI_compare_aliases<"cmls", ".8b", CMHSvvv_8B, VPR64>;
+def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>;
+def CMLSvvv_4H : NeonI_compare_aliases<"cmls", ".4h", CMHSvvv_4H, VPR64>;
+def CMLSvvv_8H : NeonI_compare_aliases<"cmls", ".8h", CMHSvvv_8H, VPR128>;
+def CMLSvvv_2S : NeonI_compare_aliases<"cmls", ".2s", CMHSvvv_2S, VPR64>;
+def CMLSvvv_4S : NeonI_compare_aliases<"cmls", ".4s", CMHSvvv_4S, VPR128>;
+def CMLSvvv_2D : NeonI_compare_aliases<"cmls", ".2d", CMHSvvv_2D, VPR128>;
+
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is an alias for CMGE with the operands reversed.
+def CMLEvvv_8B : NeonI_compare_aliases<"cmle", ".8b", CMGEvvv_8B, VPR64>;
+def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>;
+def CMLEvvv_4H : NeonI_compare_aliases<"cmle", ".4h", CMGEvvv_4H, VPR64>;
+def CMLEvvv_8H : NeonI_compare_aliases<"cmle", ".8h", CMGEvvv_8H, VPR128>;
+def CMLEvvv_2S : NeonI_compare_aliases<"cmle", ".2s", CMGEvvv_2S, VPR64>;
+def CMLEvvv_4S : NeonI_compare_aliases<"cmle", ".4s", CMGEvvv_4S, VPR128>;
+def CMLEvvv_2D : NeonI_compare_aliases<"cmle", ".2d", CMGEvvv_2D, VPR128>;
+
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is an alias for CMHI with the operands reversed.
+def CMLOvvv_8B : NeonI_compare_aliases<"cmlo", ".8b", CMHIvvv_8B, VPR64>;
+def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>;
+def CMLOvvv_4H : NeonI_compare_aliases<"cmlo", ".4h", CMHIvvv_4H, VPR64>;
+def CMLOvvv_8H : NeonI_compare_aliases<"cmlo", ".8h", CMHIvvv_8H, VPR128>;
+def CMLOvvv_2S : NeonI_compare_aliases<"cmlo", ".2s", CMHIvvv_2S, VPR64>;
+def CMLOvvv_4S : NeonI_compare_aliases<"cmlo", ".4s", CMHIvvv_4S, VPR128>;
+def CMLOvvv_2D : NeonI_compare_aliases<"cmlo", ".2d", CMHIvvv_2D, VPR128>;
+
+// Vector Compare Mask Less Than (Integer)
+// CMLT is an alias for CMGT with the operands reversed.
+def CMLTvvv_8B : NeonI_compare_aliases<"cmlt", ".8b", CMGTvvv_8B, VPR64>;
+def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>;
+def CMLTvvv_4H : NeonI_compare_aliases<"cmlt", ".4h", CMGTvvv_4H, VPR64>;
+def CMLTvvv_8H : NeonI_compare_aliases<"cmlt", ".8h", CMGTvvv_8H, VPR128>;
+def CMLTvvv_2S : NeonI_compare_aliases<"cmlt", ".2s", CMGTvvv_2S, VPR64>;
+def CMLTvvv_4S : NeonI_compare_aliases<"cmlt", ".4s", CMGTvvv_4S, VPR128>;
+def CMLTvvv_2D : NeonI_compare_aliases<"cmlt", ".2d", CMGTvvv_2D, VPR128>;
+
+
+def neon_uimm0_asmoperand : AsmOperandClass
+{
+ let Name = "UImm0";
+ let PredicateMethod = "isUImm<0>";
+ let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm0 : Operand<i32>, ImmLeaf<i32, [{return Imm == 0;}]> {
+ let ParserMatchClass = neon_uimm0_asmoperand;
+ let PrintMethod = "printNeonUImm0Operand";
+}
+
+multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC>
+{
+ def _8B : NeonI_2VMisc<0b0, u, 0b00, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.8b, $Rn.8b, $Imm",
+ [(set (v8i8 VPR64:$Rd),
+ (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.16b, $Rn.16b, $Imm",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.4h, $Rn.4h, $Imm",
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.8h, $Rn.8h, $Imm",
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.2s, $Rn.2s, $Imm",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.4s, $Rn.4s, $Imm",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+
+ def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
+ asmop # "\t$Rd.2d, $Rn.2d, $Imm",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))],
+ NoItinerary>;
+}
+
+// Vector Compare Mask Equal to Zero (Integer)
+defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>;
+
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>;
+
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>;
+
+// Vector Compare Mask Less Than or Equal to Zero (Signed Integer)
+defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>;
+
+// Vector Compare Mask Less Than Zero (Signed Integer)
+defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>;
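+// For example, "cmeq v0.4s, v1.4s, #0" sets each lane of v0 to all ones
+// where the corresponding lane of v1 is zero, and to all zeros otherwise
+// (illustrative use of the compare-to-zero forms above).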
+
+// Vector Comparisons (Floating Point)
+
+// Vector Compare Mask Equal (Floating Point)
+let isCommutable = 1 in {
+defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq,
+ Neon_cmeq, Neon_cmeq,
+ v2i32, v4i32, v2i64, 0>;
+}
+
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge,
+ Neon_cmge, Neon_cmge,
+ v2i32, v4i32, v2i64, 0>;
+
+// Vector Compare Mask Greater Than (Floating Point)
+defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt,
+ Neon_cmgt, Neon_cmgt,
+ v2i32, v4i32, v2i64, 0>;
+
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is an alias for FCMGE with the operands reversed.
+def FCMLEvvv_2S : NeonI_compare_aliases<"fcmle", ".2s", FCMGEvvv_2S, VPR64>;
+def FCMLEvvv_4S : NeonI_compare_aliases<"fcmle", ".4s", FCMGEvvv_4S, VPR128>;
+def FCMLEvvv_2D : NeonI_compare_aliases<"fcmle", ".2d", FCMGEvvv_2D, VPR128>;
+
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is an alias for FCMGT with the operands reversed.
+def FCMLTvvv_2S : NeonI_compare_aliases<"fcmlt", ".2s", FCMGTvvv_2S, VPR64>;
+def FCMLTvvv_4S : NeonI_compare_aliases<"fcmlt", ".4s", FCMGTvvv_4S, VPR128>;
+def FCMLTvvv_2D : NeonI_compare_aliases<"fcmlt", ".2d", FCMGTvvv_2D, VPR128>;
+
+
+multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode,
+ string asmop, CondCode CC>
+{
+ def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode,
+ (outs VPR64:$Rd), (ins VPR64:$Rn, fpz32:$FPImm),
+ asmop # "\t$Rd.2s, $Rn.2s, $FPImm",
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpimm:$FPImm), CC)))],
+ NoItinerary>;
+
+ def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+ asmop # "\t$Rd.4s, $Rn.4s, $FPImm",
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
+ NoItinerary>;
+
+ def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode,
+ (outs VPR128:$Rd), (ins VPR128:$Rn, fpz32:$FPImm),
+ asmop # "\t$Rd.2d, $Rn.2d, $FPImm",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpimm:$FPImm), CC)))],
+ NoItinerary>;
+}
+
+// Vector Compare Mask Equal to Zero (Floating Point)
+defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>;
+
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>;
+
+// Vector Compare Mask Greater Than Zero (Floating Point)
+defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>;
+
+// Vector Compare Mask Less Than or Equal to Zero (Floating Point)
+defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>;
+
+// Vector Compare Mask Less Than Zero (Floating Point)
+defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>;
+
+// Vector Absolute Comparisons (Floating Point)
+
+// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
+defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge",
+ int_arm_neon_vacged, int_arm_neon_vacgeq,
+ int_aarch64_neon_vacgeq,
+ v2i32, v4i32, v2i64, 0>;
+
+// Vector Absolute Compare Mask Greater Than (Floating Point)
+defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt",
+ int_arm_neon_vacgtd, int_arm_neon_vacgtq,
+ int_aarch64_neon_vacgtq,
+ v2i32, v4i32, v2i64, 0>;
+
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+// FACLE is an alias for FACGE with the operands reversed.
+def FACLEvvv_2S : NeonI_compare_aliases<"facle", ".2s", FACGEvvv_2S, VPR64>;
+def FACLEvvv_4S : NeonI_compare_aliases<"facle", ".4s", FACGEvvv_4S, VPR128>;
+def FACLEvvv_2D : NeonI_compare_aliases<"facle", ".2d", FACGEvvv_2D, VPR128>;
+
+// Vector Absolute Compare Mask Less Than (Floating Point)
+// FACLT is an alias for FACGT with the operands reversed.
+def FACLTvvv_2S : NeonI_compare_aliases<"faclt", ".2s", FACGTvvv_2S, VPR64>;
+def FACLTvvv_4S : NeonI_compare_aliases<"faclt", ".4s", FACGTvvv_4S, VPR128>;
+def FACLTvvv_2D : NeonI_compare_aliases<"faclt", ".2d", FACGTvvv_2D, VPR128>;
+
+// Vector Halving Add (Signed and Unsigned Integer)
+defm SHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd",
+ int_arm_neon_vhadds, 1>;
+defm UHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd",
+ int_arm_neon_vhaddu, 1>;
+
+// Vector Halving Sub (Signed and Unsigned Integer)
+defm SHSUBvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub",
+ int_arm_neon_vhsubs, 0>;
+defm UHSUBvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub",
+ int_arm_neon_vhsubu, 0>;
+
+// Vector Rounding Halving Add (Signed and Unsigned Integer)
+defm SRHADDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd",
+ int_arm_neon_vrhadds, 1>;
+defm URHADDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd",
+ int_arm_neon_vrhaddu, 1>;
+
+// Vector Saturating Add (Signed and Unsigned Integer)
+defm SQADDvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd",
+ int_arm_neon_vqadds, 1>;
+defm UQADDvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd",
+ int_arm_neon_vqaddu, 1>;
+
+// Vector Saturating Sub (Signed and Unsigned Integer)
+defm SQSUBvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub",
+ int_arm_neon_vqsubs, 1>;
+defm UQSUBvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub",
+ int_arm_neon_vqsubu, 1>;
+
+// Vector Shift Left (Signed and Unsigned Integer)
+defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl",
+ int_arm_neon_vshifts, 1>;
+defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl",
+ int_arm_neon_vshiftu, 1>;
+
+// Vector Saturating Shift Left (Signed and Unsigned Integer)
+defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl",
+ int_arm_neon_vqshifts, 1>;
+defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl",
+ int_arm_neon_vqshiftu, 1>;
+
+// Vector Rounding Shift Left (Signed and Unsigned Integer)
+defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl",
+ int_arm_neon_vrshifts, 1>;
+defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl",
+ int_arm_neon_vrshiftu, 1>;
+
+// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl",
+ int_arm_neon_vqrshifts, 1>;
+defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl",
+ int_arm_neon_vqrshiftu, 1>;
+
+// Vector Maximum (Signed and Unsigned Integer)
+defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>;
+defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>;
+
+// Vector Minimum (Signed and Unsigned Integer)
+defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>;
+defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>;
+
+// Vector Maximum (Floating Point)
+defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax",
+ int_arm_neon_vmaxs, int_arm_neon_vmaxs,
+ int_arm_neon_vmaxs, v2f32, v4f32, v2f64, 1>;
+
+// Vector Minimum (Floating Point)
+defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin",
+ int_arm_neon_vmins, int_arm_neon_vmins,
+ int_arm_neon_vmins, v2f32, v4f32, v2f64, 1>;
+
+// Vector maxNum (Floating Point) - prefers a number over a quiet NaN
+defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm",
+ int_aarch64_neon_vmaxnm,
+ int_aarch64_neon_vmaxnm,
+ int_aarch64_neon_vmaxnm,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector minNum (Floating Point) - prefers a number over a quiet NaN
+defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm",
+ int_aarch64_neon_vminnm,
+ int_aarch64_neon_vminnm,
+ int_aarch64_neon_vminnm,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Maximum Pairwise (Signed and Unsigned Integer)
+defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>;
+defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>;
+
+// Vector Minimum Pairwise (Signed and Unsigned Integer)
+defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>;
+defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>;
+
+// Vector Maximum Pairwise (Floating Point)
+defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp",
+ int_arm_neon_vpmaxs, int_arm_neon_vpmaxs,
+ int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>;
+
+// Vector Minimum Pairwise (Floating Point)
+defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp",
+ int_arm_neon_vpmins, int_arm_neon_vpmins,
+ int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>;
+
+// Vector maxNum Pairwise (Floating Point) - prefers a number over a quiet NaN
+defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp",
+ int_aarch64_neon_vpmaxnm,
+ int_aarch64_neon_vpmaxnm,
+ int_aarch64_neon_vpmaxnm,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector minNum Pairwise (Floating Point) - prefers a number over a quiet NaN
+defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp",
+ int_aarch64_neon_vpminnm,
+ int_aarch64_neon_vpminnm,
+ int_aarch64_neon_vpminnm,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Addition Pairwise (Integer)
+defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>;
+
+// Vector Addition Pairwise (Floating Point)
+defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp",
+ int_arm_neon_vpadd,
+ int_arm_neon_vpadd,
+ int_arm_neon_vpadd,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Saturating Doubling Multiply High
+defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh",
+ int_arm_neon_vqdmulh, 1>;
+
+// Vector Saturating Rounding Doubling Multiply High
+defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh",
+ int_arm_neon_vqrdmulh, 1>;
+
+// Vector Multiply Extended (Floating Point)
+defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx",
+ int_aarch64_neon_vmulx,
+ int_aarch64_neon_vmulx,
+ int_aarch64_neon_vmulx,
+ v2f32, v4f32, v2f64, 1>;
+
+// Vector Immediate Instructions
+
+multiclass neon_mov_imm_shift_asmoperands<string PREFIX>
+{
+ def _asmoperand : AsmOperandClass
+ {
+ let Name = "NeonMovImmShift" # PREFIX;
+ let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands";
+ let PredicateMethod = "isNeonMovImmShift" # PREFIX;
+ }
+}
+
+// Definitions of the vector immediate shift operands
+
+// The selectable use-cases extract the shift operation
+// information from the OpCmode field encoded in the immediate.
+def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{
+ uint64_t OpCmode = N->getZExtValue();
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ unsigned HasShift =
+ A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
+ if (!HasShift) return SDValue();
+ return CurDAG->getTargetConstant(ShiftImm, MVT::i32);
+}]>;
+
+// Vector immediate shift operands, which accept the LSL and MSL
+// shift operators with a shift amount in the range of 0, 8, 16, 24 (LSL),
+// 0, 8 (LSLH) or 8, 16 (MSL).
+defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">;
+defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">;
+// LSLH restricts the shift amount to 0 or 8 out of 0, 8, 16, 24
+defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">;
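+// For example, these operand classes accept the illustrative forms
+// "movi v0.4s, #0xab, lsl #16" (LSL), "movi v0.4h, #0xab, lsl #8" (LSLH)
+// and "mvni v0.2s, #0xab, msl #8" (MSL).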
+
+multiclass neon_mov_imm_shift_operands<string PREFIX,
+ string HALF, string ISHALF, code pred>
+{
+ def _operand : Operand<i32>, ImmLeaf<i32, pred, neon_mod_shift_imm_XFORM>
+ {
+ let PrintMethod =
+ "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
+ let DecoderMethod =
+ "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
+ let ParserMatchClass =
+ !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand");
+ }
+}
+
+defm neon_mov_imm_LSL : neon_mov_imm_shift_operands<"LSL", "", "false", [{
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ unsigned HasShift =
+ A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+ return (HasShift && !ShiftOnesIn);
+}]>;
+
+defm neon_mov_imm_MSL : neon_mov_imm_shift_operands<"MSL", "", "false", [{
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ unsigned HasShift =
+ A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+ return (HasShift && ShiftOnesIn);
+}]>;
+
+defm neon_mov_imm_LSLH : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ unsigned HasShift =
+ A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+ return (HasShift && !ShiftOnesIn);
+}]>;
+
+def neon_uimm8_asmoperand : AsmOperandClass
+{
+ let Name = "UImm8";
+ let PredicateMethod = "isUImm<8>";
+ let RenderMethod = "addImmOperands";
+}
+
+def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
+ let ParserMatchClass = neon_uimm8_asmoperand;
+ let PrintMethod = "printNeonUImm8Operand";
+}
+
+def neon_uimm64_mask_asmoperand : AsmOperandClass
+{
+ let Name = "NeonUImm64Mask";
+ let PredicateMethod = "isNeonUImm64Mask";
+ let RenderMethod = "addNeonUImm64MaskOperands";
+}
+
+// MCOperand for a 64-bit bytemask, in which each byte is either
+// 0x00 or 0xff, encoded as an unsigned 8-bit value
+def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
+ let ParserMatchClass = neon_uimm64_mask_asmoperand;
+ let PrintMethod = "printNeonUImm64MaskOperand";
+}
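+// For example, "movi v0.2d, #0xff00ff0000ff00ff" is a valid bytemask; it
+// is encoded as the 8-bit value 0xa5, one bit per byte, least significant
+// byte first (illustrative).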
+
+multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
+ SDPatternOperator opnode>
+{
+ // shift zeros, per word
+ def _2S : NeonI_1VModImm<0b0, op,
+ (outs VPR64:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_LSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (opnode (timm:$Imm),
+ (neon_mov_imm_LSL_operand:$Simm))))],
+ NoItinerary> {
+ bits<2> Simm;
+ let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
+ }
+
+ def _4S : NeonI_1VModImm<0b1, op,
+ (outs VPR128:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_LSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (timm:$Imm),
+ (neon_mov_imm_LSL_operand:$Simm))))],
+ NoItinerary> {
+ bits<2> Simm;
+ let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
+ }
+
+ // shift zeros, per halfword
+ def _4H : NeonI_1VModImm<0b0, op,
+ (outs VPR64:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm),
+ !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (opnode (timm:$Imm),
+ (neon_mov_imm_LSLH_operand:$Simm))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b0, Simm, 0b0};
+ }
+
+ def _8H : NeonI_1VModImm<0b1, op,
+ (outs VPR128:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm),
+ !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (opnode (timm:$Imm),
+ (neon_mov_imm_LSLH_operand:$Simm))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b0, Simm, 0b0};
+ }
+}
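+// For example, "movi v0.2s, #0xff, lsl #16" uses the _2S form above with
+// the encoded Simm = 0b10, so cmode = 0b0100 (a worked example of the
+// cmode encoding).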
+
+multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
+ SDPatternOperator opnode,
+ SDPatternOperator neonopnode>
+{
+ let Constraints = "$src = $Rd" in {
+ // shift zeros, per word
+ def _2S : NeonI_1VModImm<0b0, op,
+ (outs VPR64:$Rd),
+ (ins VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (opnode (v2i32 VPR64:$src),
+ (v2i32 (bitconvert (v2i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))))],
+ NoItinerary> {
+ bits<2> Simm;
+ let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
+ }
+
+ def _4S : NeonI_1VModImm<0b1, op,
+ (outs VPR128:$Rd),
+ (ins VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (v4i32 VPR128:$src),
+ (v4i32 (bitconvert (v4i32 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))))],
+ NoItinerary> {
+ bits<2> Simm;
+ let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
+ }
+
+ // shift zeros, per halfword
+ def _4H : NeonI_1VModImm<0b0, op,
+ (outs VPR64:$Rd),
+ (ins VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm),
+ !strconcat(asmop, " $Rd.4h, $Imm$Simm"),
+ [(set (v4i16 VPR64:$Rd),
+ (v4i16 (opnode (v4i16 VPR64:$src),
+ (v4i16 (bitconvert (v4i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b0, Simm, 0b1};
+ }
+
+ def _8H : NeonI_1VModImm<0b1, op,
+ (outs VPR128:$Rd),
+ (ins VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm),
+ !strconcat(asmop, " $Rd.8h, $Imm$Simm"),
+ [(set (v8i16 VPR128:$Rd),
+ (v8i16 (opnode (v8i16 VPR128:$src),
+ (v8i16 (bitconvert (v8i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSL_operand:$Simm)))))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b0, Simm, 0b1};
+ }
+ }
+}
+
+multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
+ SDPatternOperator opnode>
+{
+ // shift ones, per word
+ def _2S : NeonI_1VModImm<0b0, op,
+ (outs VPR64:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_MSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.2s, $Imm$Simm"),
+ [(set (v2i32 VPR64:$Rd),
+ (v2i32 (opnode (timm:$Imm),
+ (neon_mov_imm_MSL_operand:$Simm))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b1, 0b0, Simm};
+ }
+
+ def _4S : NeonI_1VModImm<0b1, op,
+ (outs VPR128:$Rd),
+ (ins neon_uimm8:$Imm,
+ neon_mov_imm_MSL_operand:$Simm),
+ !strconcat(asmop, " $Rd.4s, $Imm$Simm"),
+ [(set (v4i32 VPR128:$Rd),
+ (v4i32 (opnode (timm:$Imm),
+ (neon_mov_imm_MSL_operand:$Simm))))],
+ NoItinerary> {
+ bit Simm;
+ let cmode = {0b1, 0b1, 0b0, Simm};
+ }
+}
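+// For example, "movi v0.2s, #0xff, msl #8" has the encoded Simm = 0,
+// giving cmode = 0b1100, and "msl #16" gives cmode = 0b1101 (worked
+// examples of the MSL cmode encoding above).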
+
+// Vector Move Immediate Shifted
+let isReMaterializable = 1 in {
+defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>;
+}
+
+// Vector Move Inverted Immediate Shifted
+let isReMaterializable = 1 in {
+defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>;
+}
+
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+let isReMaterializable = 1 in {
+defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1,
+ and, Neon_mvni>;
+}
+
+// Vector Bitwise OR - immediate
+
+let isReMaterializable = 1 in {
+defm ORRvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0,
+ or, Neon_movi>;
+}
+
+// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immediate.
+// LowerBUILD_VECTOR favors lowering MOVI over MVNI.
+// BIC immediate instruction selection requires additional patterns to
+// transform Neon_movi operands into BIC immediate operands.
+
+def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{
+ uint64_t OpCmode = N->getZExtValue();
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
+ // LSLH restricts the shift amount to 0 or 8, which are encoded as 0 and 1.
+ // Transform the encoded shift amount: 0 to 1 and 1 to 0.
+ return CurDAG->getTargetConstant(!ShiftImm, MVT::i32);
+}]>;
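+// For example, a Neon_movi with the encoded LSLH shift amount 0 (LSL #0)
+// is rewritten by the patterns below as a BIC with the encoded shift
+// amount 1 (LSL #8), clearing the opposite byte of each halfword.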
+
+def neon_mov_imm_LSLH_transform_operand
+ : ImmLeaf<i32, [{
+ unsigned ShiftImm;
+ unsigned ShiftOnesIn;
+ unsigned HasShift =
+ A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
+ return (HasShift && !ShiftOnesIn); }],
+ neon_mov_imm_LSLH_transform_XFORM>;
+
+// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL #8)
+// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff)
+def : Pat<(v4i16 (and VPR64:$src,
+ (v4i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
+ (BICvi_lsl_4H VPR64:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+
+// Transform (and A, (8h Neon_movi 0xff)) -> BIC 8h (A, 0xff, LSL #8)
+// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff)
+def : Pat<(v8i16 (and VPR128:$src,
+ (v8i16 (Neon_movi 255, neon_mov_imm_LSLH_transform_operand:$Simm)))),
+ (BICvi_lsl_8H VPR128:$src, 255,
+ neon_mov_imm_LSLH_transform_operand:$Simm)>;
+
+
+multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
+ SDPatternOperator neonopnode,
+ Instruction INST4H,
+ Instruction INST8H> {
+ def : Pat<(v8i8 (opnode VPR64:$src,
+ (bitconvert(v4i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4H VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v1i64 (opnode VPR64:$src,
+ (bitconvert(v4i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST4H VPR64:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+
+ def : Pat<(v16i8 (opnode VPR128:$src,
+ (bitconvert(v8i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST8H VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v4i32 (opnode VPR128:$src,
+ (bitconvert(v8i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST8H VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+ def : Pat<(v2i64 (opnode VPR128:$src,
+ (bitconvert(v8i16 (neonopnode timm:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm))))),
+ (INST8H VPR128:$src, neon_uimm8:$Imm,
+ neon_mov_imm_LSLH_operand:$Simm)>;
+}
+
+// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immediate
+defm : Neon_bitwiseVi_patterns<and, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H>;
+
+// Additional patterns for Vector Bitwise OR - immediate
+defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H>;
+
+
+// Vector Move Immediate Masked
+let isReMaterializable = 1 in {
+defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>;
+}
+
+// Vector Move Inverted Immediate Masked
+let isReMaterializable = 1 in {
+defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
+}
+
+class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
+ Instruction inst, RegisterClass VPRC>
+ : NeonInstAlias<!strconcat(asmop, " $Rd," # asmlane # ", $Imm"),
+ (inst VPRC:$Rd, neon_uimm8:$Imm, 0), 0b0>;
+
+// Aliases for Vector Move Immediate Shifted
+def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Move Inverted Immediate Shifted
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate
+def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>;
+
+// Aliases for Vector Bitwise OR - immediate
+def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>;
+def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>;
+
+// Vector Move Immediate - per byte
+let isReMaterializable = 1 in {
+def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0,
+ (outs VPR64:$Rd), (ins neon_uimm8:$Imm),
+ "movi\t$Rd.8b, $Imm",
+ [(set (v8i8 VPR64:$Rd),
+ (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))],
+ NoItinerary> {
+ let cmode = 0b1110;
+}
+
+def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0,
+ (outs VPR128:$Rd), (ins neon_uimm8:$Imm),
+ "movi\t$Rd.16b, $Imm",
+ [(set (v16i8 VPR128:$Rd),
+ (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))],
+ NoItinerary> {
+ let cmode = 0b1110;
+}
+}
+
+// Vector Move Immediate - bytemask, per double word
+let isReMaterializable = 1 in {
+def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1,
+ (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm),
+ "movi\t$Rd.2d, $Imm",
+ [(set (v2i64 VPR128:$Rd),
+ (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))],
+ NoItinerary> {
+ let cmode = 0b1110;
+}
+}
+
+// Vector Move Immediate - bytemask, one doubleword
+
+let isReMaterializable = 1 in {
+def MOVIdi : NeonI_1VModImm<0b0, 0b1,
+ (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
+ "movi\t$Rd, $Imm",
+ [(set (f64 FPR64:$Rd),
+ (f64 (bitconvert
+ (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))))],
+ NoItinerary> {
+ let cmode = 0b1110;
+}
+}
+
+// Vector Floating Point Move Immediate
+
+class NeonI_FMOV_impl<string asmlane, RegisterClass VPRC, ValueType OpTy,
+ Operand immOpType, bit q, bit op>
+ : NeonI_1VModImm<q, op,
+ (outs VPRC:$Rd), (ins immOpType:$Imm),
+ "fmov\t$Rd" # asmlane # ", $Imm",
+ [(set (OpTy VPRC:$Rd),
+ (OpTy (Neon_fmovi (timm:$Imm))))],
+ NoItinerary> {
+ let cmode = 0b1111;
+ }
+
+let isReMaterializable = 1 in {
+def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64, v2f32, fmov32_operand, 0b0, 0b0>;
+def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
+def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
+}
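+// For example, "fmov v0.2s, #1.0" materializes 1.0 in both single-precision
+// lanes (illustrative use of the definitions above).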
+
+// Scalar Arithmetic
+
+class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
+ : NeonI_Scalar3Same<u, 0b11, opcode,
+ (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
+ !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+
+multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
+ string asmop, bit Commutable = 0>
+{
+ let isCommutable = Commutable in {
+ def bbb : NeonI_Scalar3Same<u, 0b00, opcode,
+ (outs FPR8:$Rd), (ins FPR8:$Rn, FPR8:$Rm),
+ !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ def hhh : NeonI_Scalar3Same<u, 0b01, opcode,
+ (outs FPR16:$Rd), (ins FPR16:$Rn, FPR16:$Rm),
+ !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ def sss : NeonI_Scalar3Same<u, 0b10, opcode,
+ (outs FPR32:$Rd), (ins FPR32:$Rn, FPR32:$Rm),
+ !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ def ddd : NeonI_Scalar3Same<u, 0b11, opcode,
+ (outs FPR64:$Rd), (ins FPR64:$Rn, FPR64:$Rm),
+ !strconcat(asmop, " $Rd, $Rn, $Rm"),
+ [],
+ NoItinerary>;
+ }
+}
+
+class Neon_Scalar_D_size_patterns<SDPatternOperator opnode, Instruction INSTD>
+ : Pat<(v1i64 (opnode (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
+ (SUBREG_TO_REG (i64 0),
+ (INSTD (EXTRACT_SUBREG VPR64:$Rn, sub_64),
+ (EXTRACT_SUBREG VPR64:$Rm, sub_64)),
+ sub_64)>;
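+// For example, this class lets (v1i64 (add VPR64:$Rn, VPR64:$Rm)) select
+// the scalar ADDddd defined below, using EXTRACT_SUBREG/SUBREG_TO_REG to
+// move between the VPR64 and FPR64 views of the same D register.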
+
+
+// Scalar Integer Add
+let isCommutable = 1 in {
+def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
+}
+
+// Scalar Integer Sub
+def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
+
+// Patterns for Scalar Integer Add and Sub with the D register
+def : Neon_Scalar_D_size_patterns<add, ADDddd>;
+def : Neon_Scalar_D_size_patterns<sub, SUBddd>;
+
+// Scalar Integer Saturating Add (Signed, Unsigned)
+defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
+defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
+
+// Scalar Integer Saturating Sub (Signed, Unsigned)
+defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
+defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
+
+// Patterns for Scalar Integer Saturating Add and Sub with the D register only
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqadds, SQADDddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqaddu, UQADDddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubs, SQSUBddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqsubu, UQSUBddd>;
+
+// Scalar Integer Shift Left (Signed, Unsigned)
+def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
+def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
+
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+defm SQSHL : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
+defm UQSHL : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
+
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+def SRSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
+def URSHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
+
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+defm SQRSHL : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
+defm UQRSHL : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
+
+// Patterns for Scalar Integer Shift Left, Saturating Shift Left,
+// Rounding Shift Left and Saturating Rounding Shift Left with the D register only
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
+def : Neon_Scalar_D_size_patterns<shl, SSHLddd>;
+def : Neon_Scalar_D_size_patterns<shl, USHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
+def : Neon_Scalar_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// 64-bit vector bitcasts...
+
+def : Pat<(v1i64 (bitconvert (v8i8 VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8 VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8 VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8 VPR64:$src))), (v4i16 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v4i16 VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4i16 VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v2i32 VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2i32 VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v1i64 (bitconvert (v2f32 VPR64:$src))), (v1i64 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v2f32 VPR64:$src))), (v8i8 VPR64:$src)>;
+
+def : Pat<(v2f32 (bitconvert (v1i64 VPR64:$src))), (v2f32 VPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v1i64 VPR64:$src))), (v8i8 VPR64:$src)>;
+
+// ...and 128-bit vector bitcasts...
+
+def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>;
+
+
+// ...and scalar bitcasts...
+
+def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))),
+ (f64 (EXTRACT_SUBREG (v8i8 VPR64:$src), sub_64))>;
+def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))),
+ (f64 (EXTRACT_SUBREG (v4i16 VPR64:$src), sub_64))>;
+def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))),
+ (f64 (EXTRACT_SUBREG (v2i32 VPR64:$src), sub_64))>;
+def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))),
+ (f64 (EXTRACT_SUBREG (v2f32 VPR64:$src), sub_64))>;
+def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))),
+ (f64 (EXTRACT_SUBREG (v1i64 VPR64:$src), sub_64))>;
+def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v16i8 VPR128:$src), sub_alias))>;
+def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v8i16 VPR128:$src), sub_alias))>;
+def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v4i32 VPR128:$src), sub_alias))>;
+def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v2i64 VPR128:$src), sub_alias))>;
+def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v4f32 VPR128:$src), sub_alias))>;
+def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))),
+ (f128 (EXTRACT_SUBREG (v2f64 VPR128:$src), sub_alias))>;
+
+def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
+ (v8i8 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
+def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
+ (v4i16 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
+def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
+ (v2i32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
+def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
+ (v2f32 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
+def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))),
+ (v1i64 (SUBREG_TO_REG (i64 0), (f64 FPR64:$src), sub_64))>;
+def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
+ (v16i8 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
+def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
+ (v8i16 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
+def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
+ (v4i32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
+def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
+ (v2i64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
+def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
+ (v4f32 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
+def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
+ (v2f64 (SUBREG_TO_REG (i128 0), (f128 FPR128:$src),
+ sub_alias))>;
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 3d22330afe..7ce5ce3441 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -109,6 +109,11 @@ bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
case MachineOperand::MO_Immediate:
MCOp = MCOperand::CreateImm(MO.getImm());
break;
+ case MachineOperand::MO_FPImmediate: {
+ assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported");
+ MCOp = MCOperand::CreateFPImm(0.0);
+ break;
+ }
case MachineOperand::MO_BlockAddress:
MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
break;
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index cc2bb6135c..b3a81b1dc0 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -185,7 +185,7 @@ foreach Index = 0-31 in {
// These two classes contain the same registers, which should be reasonably
// sensible for MC and allocation purposes, but allows them to be treated
// separately for things like stack spilling.
-def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8], 64,
+def VPR64 : RegisterClass<"AArch64", [v2f32, v2i32, v4i16, v8i8, v1i64], 64,
(sequence "V%u", 0, 31)>;
def VPR128 : RegisterClass<"AArch64",
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index d17b738209..d71bb4e973 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -26,10 +26,8 @@
using namespace llvm;
AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS)
- : AArch64GenSubtargetInfo(TT, CPU, FS)
- , HasNEON(true)
- , HasCrypto(true)
- , TargetTriple(TT) {
+ : AArch64GenSubtargetInfo(TT, CPU, FS), HasNEON(false), HasCrypto(false),
+ TargetTriple(TT) {
ParseSubtargetFeatures(CPU, FS);
}
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 2e9205fc99..35a7c8d85d 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -48,6 +48,9 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
+ bool hasNEON() const { return HasNEON; }
+
+ bool hasCrypto() const { return HasCrypto; }
};
} // End llvm namespace
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 10a9a6a406..43e91ac4e0 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -664,8 +664,42 @@ public:
return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
}
- template<int MemSize> bool isSImm7Scaled() const {
- if (!isImm()) return false;
+ bool isNeonMovImmShiftLSL() const {
+ if (!isShiftOrExtend())
+ return false;
+
+ if (ShiftExtend.ShiftType != A64SE::LSL)
+ return false;
+
+ // Valid shift amounts are 0, 8, 16 and 24.
+ return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24;
+ }
+
+ bool isNeonMovImmShiftLSLH() const {
+ if (!isShiftOrExtend())
+ return false;
+
+ if (ShiftExtend.ShiftType != A64SE::LSL)
+ return false;
+
+ // Valid shift amounts are 0 and 8.
+ return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8;
+ }
+
+ bool isNeonMovImmShiftMSL() const {
+ if (!isShiftOrExtend())
+ return false;
+
+ if (ShiftExtend.ShiftType != A64SE::MSL)
+ return false;
+
+ // Valid shift amounts are 8 and 16.
+ return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
+ }
+
+ template <int MemSize> bool isSImm7Scaled() const {
+ if (!isImm())
+ return false;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
if (!CE) return false;
@@ -705,10 +739,27 @@ public:
return isa<MCConstantExpr>(getImm());
}
+ bool isNeonUImm64Mask() const {
+ if (!isImm())
+ return false;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (!CE)
+ return false;
+
+ uint64_t Value = CE->getValue();
+
+ // i64 value with each byte being either 0x00 or 0xff.
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8)
+ if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff)
+ return false;
+ return true;
+ }
+
static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
unsigned ShiftAmount,
bool ImplicitAmount,
- SMLoc S, SMLoc E) {
+ SMLoc S, SMLoc E) {
AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E);
Op->ImmWithLSL.Val = Val;
Op->ImmWithLSL.ShiftAmount = ShiftAmount;
@@ -1026,6 +1077,40 @@ public:
Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
}
+ // For the shifted immediate operands of vector immediate instructions.
+ void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24)
+ llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+ // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3.
+ int64_t Imm = ShiftExtend.Amount / 8;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8)
+ llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+ // Encode LSLH shift amount 0, 8 as 0, 1.
+ int64_t Imm = ShiftExtend.Amount / 8;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
+ void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16)
+ llvm_unreachable("Invalid shift amount for vector immediate inst.");
+
+ // Encode MSL shift amount 8, 16 as 0, 1.
+ int64_t Imm = ShiftExtend.Amount / 8 - 1;
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
+
// For the extend in load-store (register offset) instructions.
template<unsigned MemSize>
void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const {
@@ -1065,6 +1150,20 @@ public:
Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
}
+
+ void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ // A bit from each byte in the constant forms the encoded immediate
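+ // e.g. Value == 0xff00ff0000ff00ff yields Imm == 0xa5 (0b10100101),
+ // taking the low bit of each byte, least significant byte first.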
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ uint64_t Value = CE->getValue();
+
+ unsigned Imm = 0;
+ for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
+ Imm |= (Value & 1) << i;
+ }
+ Inst.addOperand(MCOperand::CreateImm(Imm));
+ }
};
} // end anonymous namespace.
@@ -1660,20 +1759,21 @@ AArch64AsmParser::ParseShiftExtend(
std::string LowerID = IDVal.lower();
A64SE::ShiftExtSpecifiers Spec =
- StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
- .Case("lsl", A64SE::LSL)
- .Case("lsr", A64SE::LSR)
- .Case("asr", A64SE::ASR)
- .Case("ror", A64SE::ROR)
- .Case("uxtb", A64SE::UXTB)
- .Case("uxth", A64SE::UXTH)
- .Case("uxtw", A64SE::UXTW)
- .Case("uxtx", A64SE::UXTX)
- .Case("sxtb", A64SE::SXTB)
- .Case("sxth", A64SE::SXTH)
- .Case("sxtw", A64SE::SXTW)
- .Case("sxtx", A64SE::SXTX)
- .Default(A64SE::Invalid);
+ StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
+ .Case("lsl", A64SE::LSL)
+ .Case("msl", A64SE::MSL)
+ .Case("lsr", A64SE::LSR)
+ .Case("asr", A64SE::ASR)
+ .Case("ror", A64SE::ROR)
+ .Case("uxtb", A64SE::UXTB)
+ .Case("uxth", A64SE::UXTH)
+ .Case("uxtw", A64SE::UXTW)
+ .Case("uxtx", A64SE::UXTX)
+ .Case("sxtb", A64SE::SXTB)
+ .Case("sxth", A64SE::SXTH)
+ .Case("sxtw", A64SE::SXTW)
+ .Case("sxtx", A64SE::SXTX)
+ .Default(A64SE::Invalid);
if (Spec == A64SE::Invalid)
return MatchOperand_NoMatch;
@@ -1683,8 +1783,8 @@ AArch64AsmParser::ParseShiftExtend(
S = Parser.getTok().getLoc();
Parser.Lex();
- if (Spec != A64SE::LSL && Spec != A64SE::LSR &&
- Spec != A64SE::ASR && Spec != A64SE::ROR) {
+ if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR &&
+ Spec != A64SE::ROR && Spec != A64SE::MSL) {
// The shift amount can be omitted for the extending versions, but not real
// shifts:
// add x0, x0, x0, uxtb
@@ -2019,7 +2119,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
"expected compatible register or floating-point constant");
case Match_FPZero:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
- "expected floating-point constant #0.0");
+ "expected floating-point constant #0.0 or invalid register type");
case Match_Label:
return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
"expected label or encodable integer pc offset");
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 36dd704140..a88a8e8e9e 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -85,6 +85,9 @@ static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeVPR128RegisterClass(llvm::MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
@@ -126,6 +129,10 @@ static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
unsigned ShiftAmount,
uint64_t Address,
const void *Decoder);
+template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+static DecodeStatus
+DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
+ uint64_t Address, const void *Decoder);
static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
unsigned ShiftAmount,
@@ -336,9 +343,20 @@ DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeVPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ uint16_t Register = getReg(Decoder, AArch64::VPR64RegClassID, RegNo);
+ Inst.addOperand(MCOperand::CreateReg(Register));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus
DecodeVPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address, const void *Decoder) {
+ uint64_t Address, const void *Decoder) {
if (RegNo > 31)
return MCDisassembler::Fail;
@@ -799,4 +817,24 @@ extern "C" void LLVMInitializeAArch64Disassembler() {
createAArch64Disassembler);
}
+template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+static DecodeStatus
+DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
+ uint64_t Address, const void *Decoder) {
+ bool IsLSL = false;
+ if (Ext == A64SE::LSL)
+ IsLSL = true;
+ else if (Ext != A64SE::MSL)
+ return MCDisassembler::Fail;
+
+ // MSL and LSLH accept an encoded shift amount of 0 or 1.
+ if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1)
+ return MCDisassembler::Fail;
+
+ // LSL accepts an encoded shift amount of 0, 1, 2 or 3.
+ if (IsLSL && ShiftAmount > 3)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
+ return MCDisassembler::Success;
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 82ce80c8b1..b6243310d5 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -406,3 +406,84 @@ void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
printAnnotation(O, Annot);
}
+
+template <A64SE::ShiftExtSpecifiers Ext, bool isHalf>
+void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+
+ assert(MO.isImm() &&
+ "Immediate operand required for Neon vector immediate inst.");
+
+ bool IsLSL = false;
+ if (Ext == A64SE::LSL)
+ IsLSL = true;
+ else if (Ext != A64SE::MSL)
+ llvm_unreachable("Invalid shift specifier in movi instruction");
+
+ int64_t Imm = MO.getImm();
+
+ // MSL and LSLH accept an encoded shift amount of 0 or 1.
+ if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1)
+ llvm_unreachable("Invalid shift amount in movi instruction");
+
+ // LSL accepts an encoded shift amount of 0, 1, 2 or 3.
+ if (IsLSL && (Imm < 0 || Imm > 3))
+ llvm_unreachable("Invalid shift amount in movi instruction");
+
+ // Print the shift amount as a multiple of 8, with the MSL encoded shift
+ // amounts 0 and 1 printed as 8 and 16.
+ if (!IsLSL)
+ Imm++;
+ Imm *= 8;
+
+ // LSL #0 is not printed
+ if (IsLSL) {
+ if (Imm == 0)
+ return;
+ O << ", lsl";
+ } else
+ O << ", msl";
+
+ O << " #" << Imm;
+}
+
+void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &o) {
+ o << "#0x0";
+}
+
+void AArch64InstPrinter::printNeonUImm8Operand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MOUImm = MI->getOperand(OpNum);
+
+ assert(MOUImm.isImm() &&
+ "Immediate operand required for Neon vector immediate inst.");
+
+ unsigned Imm = MOUImm.getImm();
+
+ O << "#0x";
+ O.write_hex(Imm);
+}
+
+void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ const MCOperand &MOUImm8 = MI->getOperand(OpNum);
+
+ assert(MOUImm8.isImm() &&
+ "Immediate operand required for Neon vector immediate bytemask inst.");
+
+ uint32_t UImm8 = MOUImm8.getImm();
+ uint64_t Mask = 0;
+
+ // Replicate a 0x00 or 0xff byte across the 64-bit mask, one byte per bit of UImm8
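+ // e.g. UImm8 == 0xa5 (0b10100101) expands to Mask == 0xff00ff0000ff00ff.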
+ for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+ if ((UImm8 >> ByteNum) & 1)
+ Mask |= (uint64_t)0xff << (8 * ByteNum);
+ }
+
+ O << "#0x";
+ O.write_hex(Mask);
+}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 639fa869c0..f7439bec66 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -164,9 +164,14 @@ public:
return RegNo == AArch64::XSP || RegNo == AArch64::WSP;
}
-
+ template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
+ void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
+ void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printNeonUImm8Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
};
-
}
#endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 48d48190fd..58fc95c2ea 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -40,7 +40,7 @@ MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT,
StringRef CPU,
StringRef FS) {
MCSubtargetInfo *X = new MCSubtargetInfo();
- InitAArch64MCSubtargetInfo(X, TT, CPU, "");
+ InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
return X;
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 79865f6aa5..2a97cd6325 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -1105,3 +1105,69 @@ bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
return isMOVNImm(RegWidth, Value, UImm16, Shift);
}
+
+// decodeNeonModShiftImm - Decode a Neon OpCmode value into the
+// shift amount and the shift type (shift zeros or ones in) and
+// return whether the OpCmode value implies a shift operation.
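+// For example, OpCmode 0b1010 selects "shift zeros, per halfword" and
+// yields ShiftImm = 1 (i.e. LSL #8) with ShiftOnesIn = false.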
+bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+ unsigned &ShiftOnesIn) {
+ ShiftImm = 0;
+ ShiftOnesIn = false;
+ bool HasShift = true;
+
+ if (OpCmode == 0xe) {
+ // movi byte
+ HasShift = false;
+ } else if (OpCmode == 0x1e) {
+ // movi 64-bit bytemask
+ HasShift = false;
+ } else if ((OpCmode & 0xc) == 0x8) {
+ // shift zeros, per halfword
+ ShiftImm = ((OpCmode & 0x2) >> 1);
+ } else if ((OpCmode & 0x8) == 0) {
+ // shift zeros, per word
+ ShiftImm = ((OpCmode & 0x6) >> 1);
+ } else if ((OpCmode & 0xe) == 0xc) {
+ // shift ones, per word
+ ShiftOnesIn = true;
+ ShiftImm = (OpCmode & 0x1);
+ } else {
+ // remaining op:cmode forms (the FP immediates) are not shifts
+ llvm_unreachable("Unsupported Neon modified immediate");
+ }
+
+ return HasShift;
+}
+
+// decodeNeonModImm - Decode a NEON modified immediate value and its OpCmode
+// into the element value and the element size in bits. For the shifted
+// forms the raw 8-bit value is returned; the caller applies the shift.
+uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode,
+ unsigned &EltBits) {
+ uint64_t DecodedVal = Val;
+ EltBits = 0;
+
+ if (OpCmode == 0xe) {
+ // movi byte
+ EltBits = 8;
+ } else if (OpCmode == 0x1e) {
+ // movi 64-bit bytemask
+ DecodedVal = 0;
+ for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
+ if ((Val >> ByteNum) & 1)
+ DecodedVal |= (uint64_t)0xff << (8 * ByteNum);
+ }
+ EltBits = 64;
+ } else if ((OpCmode & 0xc) == 0x8) {
+ // shift zeros, per halfword
+ EltBits = 16;
+ } else if ((OpCmode & 0x8) == 0) {
+ // shift zeros, per word
+ EltBits = 32;
+ } else if ((OpCmode & 0xe) == 0xc) {
+ // shift ones, per word
+ EltBits = 32;
+ } else {
+ llvm_unreachable("Unsupported Neon modified immediate");
+ }
+ return DecodedVal;
+}
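
For checking an OpCmode by hand, here is a minimal standalone mirror of
decodeNeonModShiftImm (the decode logic is copied from the function above;
the driver and its sample value are illustrative only):

    #include <cassert>
    #include <cstdio>

    static bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
                                      unsigned &ShiftOnesIn) {
      ShiftImm = 0;
      ShiftOnesIn = false;
      if (OpCmode == 0xe || OpCmode == 0x1e)
        return false;                    // movi byte / 64-bit bytemask: no shift
      if ((OpCmode & 0xc) == 0x8)        // shift zeros, per halfword
        ShiftImm = (OpCmode & 0x2) >> 1;
      else if ((OpCmode & 0x8) == 0)     // shift zeros, per word
        ShiftImm = (OpCmode & 0x6) >> 1;
      else if ((OpCmode & 0xe) == 0xc) { // shift ones, per word
        ShiftOnesIn = true;
        ShiftImm = OpCmode & 0x1;
      } else
        assert(0 && "Unsupported Neon modified immediate");
      return true;
    }

    int main() {
      unsigned ShiftImm, ShiftOnesIn;
      // OpCmode 0b1010: per-halfword form with the shift bit set.
      bool HasShift = decodeNeonModShiftImm(0xa, ShiftImm, ShiftOnesIn);
      // Prints: HasShift=1 ShiftImm=1 (LSL #8) ShiftOnesIn=0
      printf("HasShift=%d ShiftImm=%u (LSL #%u) ShiftOnesIn=%u\n",
             HasShift, ShiftImm, ShiftImm * 8, ShiftOnesIn);
      return 0;
    }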
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 9a1ca6127a..e675efc9d9 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -289,6 +289,7 @@ namespace A64SE {
enum ShiftExtSpecifiers {
Invalid = -1,
LSL,
+ MSL,
LSR,
ASR,
ROR,
@@ -1068,7 +1069,10 @@ namespace A64Imms {
// MOVN but *not* with a MOVZ (because that would take priority).
bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-}
+ uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits);
+ bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
+ unsigned &ShiftOnesIn);
+}
} // end namespace llvm;
diff --git a/test/CodeGen/AArch64/complex-copy-noneon.ll b/test/CodeGen/AArch64/complex-copy-noneon.ll
new file mode 100644
index 0000000000..4ae547856e
--- /dev/null
+++ b/test/CodeGen/AArch64/complex-copy-noneon.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s
+
+; The DAG combiner previously decided to use a vector load/store for this
+; struct copy. That probably shouldn't happen without NEON, but the most
+; important thing is that it compiles.
+
+define void @store_combine() nounwind {
+ %src = alloca { double, double }, align 8
+ %dst = alloca { double, double }, align 8
+
+ %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0
+ %src.real = load double* %src.realp
+ %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1
+ %src.imag = load double* %src.imagp
+
+ %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0
+ %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1
+ store double %src.real, double* %dst.realp
+ store double %src.imag, double* %dst.imagp
+ ret void
+}
diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll
index cfa06a4e0b..18a3b37b41 100644
--- a/test/CodeGen/AArch64/inline-asm-constraints.ll
+++ b/test/CodeGen/AArch64/inline-asm-constraints.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
define i64 @test_inline_constraint_r(i64 %base, i32 %offset) {
; CHECK-LABEL: test_inline_constraint_r:
@@ -44,6 +44,26 @@ define i32 @test_inline_constraint_Q(i32 *%ptr) {
@dump = global fp128 zeroinitializer
+define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) {
+; CHECK: test_inline_constraint_w:
+ call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64)
+ call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128)
+; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+
+ ; Arguably semantically dodgy to output "vN", but it's what GCC does
+ ; so purely for compatibility we want vector registers to be output.
+ call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef)
+ call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt)
+ call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl)
+ call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad)
+; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}}
+; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ ret void
+}
+
define void @test_inline_constraint_I() {
; CHECK-LABEL: test_inline_constraint_I:
call void asm sideeffect "add x0, x0, $0", "I"(i32 0)
diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll
new file mode 100644
index 0000000000..b423666d80
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-aba-abd.ll
@@ -0,0 +1,226 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uabd_v8i8:
+ %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uabd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %abd
+}
+
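+; The accumulating forms below check that the separate IR add of the absolute
+; difference is folded into a single uaba/saba instruction.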
+define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uaba_v8i8:
+ %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %aba = add <8 x i8> %lhs, %abd
+; CHECK: uaba v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %aba
+}
+
+define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sabd_v8i8:
+ %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sabd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %abd
+}
+
+define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_saba_v8i8:
+ %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+ %aba = add <8 x i8> %lhs, %abd
+; CHECK: saba v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %aba
+}
+
+declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uabd_v16i8:
+ %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uabd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uaba_v16i8:
+ %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %aba = add <16 x i8> %lhs, %abd
+; CHECK: uaba v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %aba
+}
+
+define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sabd_v16i8:
+ %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sabd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %abd
+}
+
+define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_saba_v16i8:
+ %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+ %aba = add <16 x i8> %lhs, %abd
+; CHECK: saba v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %aba
+}
+
+declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uabd_v4i16:
+ %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uabd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uaba_v4i16:
+ %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %aba = add <4 x i16> %lhs, %abd
+; CHECK: uaba v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %aba
+}
+
+define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sabd_v4i16:
+ %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sabd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %abd
+}
+
+define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_saba_v4i16:
+ %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+ %aba = add <4 x i16> %lhs, %abd
+; CHECK: saba v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %aba
+}
+
+declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uabd_v8i16:
+ %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uabd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uaba_v8i16:
+ %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %aba = add <8 x i16> %lhs, %abd
+; CHECK: uaba v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %aba
+}
+
+define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sabd_v8i16:
+ %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sabd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %abd
+}
+
+define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_saba_v8i16:
+ %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+ %aba = add <8 x i16> %lhs, %abd
+; CHECK: saba v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %aba
+}
+
+declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uabd_v2i32:
+ %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uabd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uaba_v2i32:
+ %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %aba = add <2 x i32> %lhs, %abd
+; CHECK: uaba v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %aba
+}
+
+define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sabd_v2i32:
+ %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sabd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %abd
+}
+
+define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_saba_v2i32:
+ %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+ %aba = add <2 x i32> %lhs, %abd
+; CHECK: saba v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %aba
+}
+
+declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uabd_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uabd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uaba_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %aba = add <4 x i32> %lhs, %abd
+; CHECK: uaba v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %aba
+}
+
+define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sabd_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sabd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %abd
+}
+
+define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_saba_v4i32:
+ %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+ %aba = add <4 x i32> %lhs, %abd
+; CHECK: saba v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %aba
+}
+
+declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>)
+
+define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fabd_v2f32:
+ %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fabd v0.2s, v0.2s, v1.2s
+ ret <2 x float> %abd
+}
+
+declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>)
+
+define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fabd_v4f32:
+ %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fabd v0.4s, v0.4s, v1.4s
+ ret <4 x float> %abd
+}
+
+declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>)
+
+define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fabd_v2f64:
+ %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fabd v0.2d, v0.2d, v1.2d
+ ret <2 x double> %abd
+}
\ No newline at end of file
diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll
new file mode 100644
index 0000000000..1abfed3190
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-add-pairwise.ll
@@ -0,0 +1,92 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; CHECK: test_addp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: addp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_addp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: addp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_addp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: addp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_addp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: addp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_addp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: addp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_addp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: addp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
+declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_addp_v2i64:
+ %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: addp v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_faddp_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: faddp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_faddp_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: faddp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_faddp_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: faddp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll
new file mode 100644
index 0000000000..65ec8a247e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-add-sub.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp3 = add <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp3 = add <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+ %tmp3 = add <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+ %tmp3 = add <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = add <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @add4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = add <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = add <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = fadd <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = fadd <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fadd {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = fadd <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp3 = sub <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp3 = sub <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h
+ %tmp3 = sub <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h
+ %tmp3 = sub <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = sub <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @sub4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = sub <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = sub <2 x i64> %A, %B;
+ ret <2 x i64> %tmp3
+}
+
+define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s
+ %tmp3 = fsub <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s
+ %tmp3 = fsub <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fsub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d
+ %tmp3 = fsub <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = add <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
+define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) {
+;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}}
+ %tmp3 = sub <1 x i64> %A, %B;
+ ret <1 x i64> %tmp3
+}
+
diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll
new file mode 100644
index 0000000000..f9ec704840
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-bitcast.ll
@@ -0,0 +1,574 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
+
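+; These bitcasts only reinterpret the bits, so every function below should
+; compile to nothing but a ret.
+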
+; From <8 x i8>
+
+define <1 x i64> @test_v8i8_to_v1i64(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind {
+; CHECK: test_v8i8_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v8i8_to_v2f32(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v8i8_to_v4i16(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v8i8_to_v8i8(<8 x i8> %in) nounwind{
+; CHECK: test_v8i8_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i8> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <4 x i16>
+
+define <1 x i64> @test_v4i16_to_v1i64(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind {
+; CHECK: test_v4i16_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v4i16_to_v2f32(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v4i16_to_v4i16(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v4i16_to_v8i8(<4 x i16> %in) nounwind{
+; CHECK: test_v4i16_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i16> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <2 x i32>
+
+define <1 x i64> @test_v2i32_to_v1i64(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind {
+; CHECK: test_v2i32_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2i32_to_v2f32(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v2i32_to_v4i16(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v2i32_to_v8i8(<2 x i32> %in) nounwind{
+; CHECK: test_v2i32_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i32> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <2 x float>
+
+define <1 x i64> @test_v2f32_to_v1i64(<2 x float> %in) nounwind {
+; CHECK: test_v2f32_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x float> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v2f32_to_v2i32(<2 x float> %in) nounwind {
+; CHECK: test_v2f32_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x float> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v2f32_to_v2f32(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x float> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v2f32_to_v4i16(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x float> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v2f32_to_v8i8(<2 x float> %in) nounwind{
+; CHECK: test_v2f32_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x float> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+; From <1 x i64>
+
+define <1 x i64> @test_v1i64_to_v1i64(<1 x i64> %in) nounwind {
+; CHECK: test_v1i64_to_v1i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <1 x i64> %in to <1 x i64>
+ ret <1 x i64> %val
+}
+
+define <2 x i32> @test_v1i64_to_v2i32(<1 x i64> %in) nounwind {
+; CHECK: test_v1i64_to_v2i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <1 x i64> %in to <2 x i32>
+ ret <2 x i32> %val
+}
+
+define <2 x float> @test_v1i64_to_v2f32(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v2f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <1 x i64> %in to <2 x float>
+ ret <2 x float> %val
+}
+
+define <4 x i16> @test_v1i64_to_v4i16(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v4i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <1 x i64> %in to <4 x i16>
+ ret <4 x i16> %val
+}
+
+define <8 x i8> @test_v1i64_to_v8i8(<1 x i64> %in) nounwind{
+; CHECK: test_v1i64_to_v8i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <1 x i64> %in to <8 x i8>
+ ret <8 x i8> %val
+}
+
+
+; From <16 x i8>
+
+define <2 x double> @test_v16i8_to_v2f64(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v16i8_to_v2i64(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v16i8_to_v4i32(<16 x i8> %in) nounwind {
+; CHECK: test_v16i8_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v16i8_to_v4f32(<16 x i8> %in) nounwind{
+; CHECK: test_v16i8_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v16i8_to_v8i16(<16 x i8> %in) nounwind{
+; CHECK: test_v16i8_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v16i8_to_v16i8(<16 x i8> %in) nounwind{
+; CHECK: test_v16i8_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <16 x i8> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
+; From <8 x i16>
+
+define <2 x double> @test_v8i16_to_v2f64(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v8i16_to_v2i64(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v8i16_to_v4i32(<8 x i16> %in) nounwind {
+; CHECK: test_v8i16_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v8i16_to_v4f32(<8 x i16> %in) nounwind{
+; CHECK: test_v8i16_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v8i16_to_v8i16(<8 x i16> %in) nounwind{
+; CHECK: test_v8i16_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v8i16_to_v16i8(<8 x i16> %in) nounwind{
+; CHECK: test_v8i16_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <8 x i16> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
+; From <4 x i32>
+
+define <2 x double> @test_v4i32_to_v2f64(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v4i32_to_v2i64(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v4i32_to_v4i32(<4 x i32> %in) nounwind {
+; CHECK: test_v4i32_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v4i32_to_v4f32(<4 x i32> %in) nounwind{
+; CHECK: test_v4i32_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v4i32_to_v8i16(<4 x i32> %in) nounwind{
+; CHECK: test_v4i32_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v4i32_to_v16i8(<4 x i32> %in) nounwind{
+; CHECK: test_v4i32_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x i32> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
+; From <4 x float>
+
+define <2 x double> @test_v4f32_to_v2f64(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v4f32_to_v2i64(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v4f32_to_v4i32(<4 x float> %in) nounwind {
+; CHECK: test_v4f32_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v4f32_to_v4f32(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v4f32_to_v8i16(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v4f32_to_v16i8(<4 x float> %in) nounwind{
+; CHECK: test_v4f32_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <4 x float> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
+; From <2 x i64>
+
+define <2 x double> @test_v2i64_to_v2f64(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v2i64_to_v2i64(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v2i64_to_v4i32(<2 x i64> %in) nounwind {
+; CHECK: test_v2i64_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v2i64_to_v4f32(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v2i64_to_v8i16(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v2i64_to_v16i8(<2 x i64> %in) nounwind{
+; CHECK: test_v2i64_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x i64> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
+; From <2 x double>
+
+define <2 x double> @test_v2f64_to_v2f64(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v2f64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <2 x double>
+ ret <2 x double> %val
+}
+
+define <2 x i64> @test_v2f64_to_v2i64(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v2i64:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <2 x i64>
+ ret <2 x i64> %val
+}
+
+define <4 x i32> @test_v2f64_to_v4i32(<2 x double> %in) nounwind {
+; CHECK: test_v2f64_to_v4i32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <4 x i32>
+ ret <4 x i32> %val
+}
+
+define <4 x float> @test_v2f64_to_v4f32(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v4f32:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <4 x float>
+ ret <4 x float> %val
+}
+
+define <8 x i16> @test_v2f64_to_v8i16(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v8i16:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <8 x i16>
+ ret <8 x i16> %val
+}
+
+define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{
+; CHECK: test_v2f64_to_v16i8:
+; CHECK-NEXT: // BB#0:
+; CHECK-NEXT: ret
+
+ %val = bitcast <2 x double> %in to <16 x i8>
+ ret <16 x i8> %val
+}
+
diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
new file mode 100644
index 0000000000..1c43b979fc
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -0,0 +1,594 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <8 x i8> %a, %b;
+ ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <16 x i8> %a, %b;
+ ret <16 x i8> %tmp1
+}
+
+
+define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <8 x i8> %a, %b;
+ ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = or <16 x i8> %a, %b;
+ ret <16 x i8> %tmp1
+}
+
+
+define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <8 x i8> %a, %b;
+ ret <8 x i8> %tmp1
+}
+
+define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <16 x i8> %a, %b;
+ ret <16 x i8> %tmp1
+}
+
+define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+ %tmp3 = or <8 x i8> %tmp1, %tmp2
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 >
+ %tmp3 = or <16 x i8> %tmp1, %tmp2
+ ret <16 x i8> %tmp3
+}
+
+define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = or <8 x i8> %a, %tmp1
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = or <16 x i8> %a, %tmp1
+ ret <16 x i8> %tmp2
+}
+
+define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) {
+;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = and <8 x i8> %a, %tmp1
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) {
+;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 >
+ %tmp2 = and <16 x i8> %a, %tmp1
+ ret <16 x i8> %tmp2
+}
+
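+; ORR (vector, immediate) takes an 8-bit value shifted left by a multiple of
+; 8, so constants of the form 0xff << (8*n) map directly onto it.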
+define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.2s, #0xff
+ %tmp1 = or <2 x i32> %a, < i32 255, i32 255>
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8
+ %tmp1 = or <2 x i32> %a, < i32 65280, i32 65280>
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16
+ %tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680>
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24
+ %tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080>
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff
+ %tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8
+ %tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16
+ %tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) {
+;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24
+ %tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff
+ %tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+ %tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff
+ %tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+ %tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10
+ %tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8
+ %tmp1 = and <2 x i32> %a, < i32 4294963199, i32 4294963199 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #16
+ %tmp1 = and <2 x i32> %a, < i32 4293918719, i32 4293918719 >
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @bicimm2s_lsl24(<2 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24
+ %tmp1 = and <2 x i32> %a, < i32 4026531839, i32 4026531839>
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10
+ %tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8
+ %tmp1 = and <4 x i32> %a, < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16
+ %tmp1 = and <4 x i32> %a, < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @bicimm4s_lsl24(<4 x i32> %a) {
+;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24
+ %tmp1 = and <4 x i32> %a, < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839>
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x10
+ %tmp1 = and <4 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x0
+ %tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439>
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.4h, #0x0, lsl #8
+ %tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255>
+ ret <4 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x10
+ %tmp1 = and <8 x i16> %a, < i16 65519, i16 65519, i16 65519, i16 65519,
+ i16 65519, i16 65519, i16 65519, i16 65519 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x0
+ %tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 61439, i16 61439, i16 61439, i16 61439,
+ i16 61439, i16 61439, i16 61439, i16 61439>
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) {
+;CHECK: bic {{v[0-31]+}}.8h, #0x0, lsl #8
+ %tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
+ ret <8 x i16> %tmp1
+}
+
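+; Vector bitwise operations are element-size agnostic: every element type
+; selects the byte-wise .8b (64-bit) or .16b (128-bit) form.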
+define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <2 x i32> %a, %b;
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <4 x i16> %a, %b;
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <1 x i64> %a, %b;
+ ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <4 x i32> %a, %b;
+ ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <8 x i16> %a, %b;
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <2 x i64> %a, %b;
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <2 x i32> %a, %b;
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <4 x i16> %a, %b;
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = or <1 x i64> %a, %b;
+ ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = or <4 x i32> %a, %b;
+ ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = or <8 x i16> %a, %b;
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = or <2 x i64> %a, %b;
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <2 x i32> %a, %b;
+ ret <2 x i32> %tmp1
+}
+
+define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <4 x i16> %a, %b;
+ ret <4 x i16> %tmp1
+}
+
+define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <1 x i64> %a, %b;
+ ret <1 x i64> %tmp1
+}
+
+define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <4 x i32> %a, %b;
+ ret <4 x i32> %tmp1
+}
+
+define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <8 x i16> %a, %b;
+ ret <8 x i16> %tmp1
+}
+
+define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <2 x i64> %a, %b;
+ ret <2 x i64> %tmp1
+}
+
+
+define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
+ %tmp2 = and <2 x i32> %a, %tmp1
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <4 x i16> %a, %tmp1
+ ret <4 x i16> %tmp2
+}
+
+define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <1 x i64> %b, < i64 -1>
+ %tmp2 = and <1 x i64> %a, %tmp1
+ ret <1 x i64> %tmp2
+}
+
+define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
+ %tmp2 = and <4 x i32> %a, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <8 x i16> %a, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+define <2 x i64> @bic2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
+ %tmp2 = and <2 x i64> %a, %tmp1
+ ret <2 x i64> %tmp2
+}
+
+define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 >
+ %tmp2 = or <2 x i32> %a, %tmp1
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = or <4 x i16> %a, %tmp1
+ ret <4 x i16> %tmp2
+}
+
+define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = xor <1 x i64> %b, < i64 -1>
+ %tmp2 = or <1 x i64> %a, %tmp1
+ ret <1 x i64> %tmp2
+}
+
+define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1>
+ %tmp2 = or <4 x i32> %a, %tmp1
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = or <8 x i16> %a, %tmp1
+ ret <8 x i16> %tmp2
+}
+
+define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1>
+ %tmp2 = or <2 x i64> %a, %tmp1
+ ret <2 x i64> %tmp2
+}
+define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 >
+ %tmp2 = and <2 x i32> %b, < i32 0, i32 0 >
+ %tmp3 = or <2 x i32> %tmp1, %tmp2
+ ret <2 x i32> %tmp3
+}
+
+
+define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <4 x i16> %b, < i16 0, i16 0, i16 0, i16 0 >
+ %tmp3 = or <4 x i16> %tmp1, %tmp2
+ ret <4 x i16> %tmp3
+}
+
+define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %tmp1 = and <1 x i64> %a, < i64 -1 >
+ %tmp2 = and <1 x i64> %b, < i64 0 >
+ %tmp3 = or <1 x i64> %tmp1, %tmp2
+ ret <1 x i64> %tmp3
+}
+
+define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 >
+ %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 >
+ %tmp3 = or <4 x i32> %tmp1, %tmp2
+ ret <4 x i32> %tmp3
+}
+
+define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 >
+ %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 >
+ %tmp3 = or <8 x i16> %tmp1, %tmp2
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 >
+ %tmp2 = and <2 x i64> %b, < i64 0, i64 0 >
+ %tmp3 = or <2 x i64> %tmp1, %tmp2
+ ret <2 x i64> %tmp3
+}
+
+
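+; The (v1 & v2) | (~v1 & v3) select pattern below matches BSL, with v1
+; providing the bit mask.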
+define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <8 x i8> %v1, %v2
+ %2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %3 = and <8 x i8> %2, %v3
+ %4 = or <8 x i8> %1, %3
+ ret <8 x i8> %4
+}
+
+define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <4 x i16> %v1, %v2
+ %2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1>
+ %3 = and <4 x i16> %2, %v3
+ %4 = or <4 x i16> %1, %3
+ ret <4 x i16> %4
+}
+
+define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <2 x i32> %v1, %v2
+ %2 = xor <2 x i32> %v1, <i32 -1, i32 -1>
+ %3 = and <2 x i32> %2, %v3
+ %4 = or <2 x i32> %1, %3
+ ret <2 x i32> %4
+}
+
+define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) {
+;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b
+ %1 = and <1 x i64> %v1, %v2
+ %2 = xor <1 x i64> %v1, <i64 -1>
+ %3 = and <1 x i64> %2, %v3
+ %4 = or <1 x i64> %1, %3
+ ret <1 x i64> %4
+}
+
+define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <16 x i8> %v1, %v2
+ %2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %3 = and <16 x i8> %2, %v3
+ %4 = or <16 x i8> %1, %3
+ ret <16 x i8> %4
+}
+
+define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <8 x i16> %v1, %v2
+ %2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %3 = and <8 x i16> %2, %v3
+ %4 = or <8 x i16> %1, %3
+ ret <8 x i16> %4
+}
+
+define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <4 x i32> %v1, %v2
+ %2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %3 = and <4 x i32> %2, %v3
+ %4 = or <4 x i32> %1, %3
+ ret <4 x i32> %4
+}
+
+define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) {
+;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b
+ %1 = and <2 x i64> %v1, %v2
+ %2 = xor <2 x i64> %v1, <i64 -1, i64 -1>
+ %3 = and <2 x i64> %2, %v3
+ %4 = or <2 x i64> %1, %3
+ ret <2 x i64> %4
+}
+
+define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff
+ %val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+ ret <8 x i8> %val
+}
+
+define <8 x i8> @orrimm8b_as_orrimm4h_lsl8(<8 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8
+ %val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+ ret <8 x i8> %val
+}
+
+define <16 x i8> @orrimm16b_as_orrimm8h_lsl0(<16 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff
+ %val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0>
+ ret <16 x i8> %val
+}
+
+define <16 x i8> @orrimm16b_as_orrimm8h_lsl8(<16 x i8> %a) {
+;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8
+ %val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255>
+ ret <16 x i8> %val
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll
new file mode 100644
index 0000000000..0848f9b03d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -0,0 +1,1982 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp eq <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp eq <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp eq <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp eq <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp eq <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp eq <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp eq <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
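+
+; Note on the cmne tests above: AdvSIMD has no CMNE instruction, so icmp ne
+; is lowered as CMEQ followed by a bitwise NOT (MOVI of all-ones, then EOR),
+; which is what the CHECK-NEXT sequences verify.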
+
+define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sgt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sgt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sgt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sgt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sgt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sgt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sgt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp slt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp slt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp slt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp slt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp slt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp slt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LT implemented as GT, so check reversed operands.
+;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp slt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp sge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp sge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp sge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp sge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp sge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp sge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp sge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp sle <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp sle <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp sle <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp sle <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp sle <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp sle <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LE implemented as GE, so check reversed operands.
+;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp sle <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ugt <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ugt <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp ugt <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp ugt <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp ugt <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp ugt <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp ugt <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ult <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp uge <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp uge <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp uge <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp uge <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp uge <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp uge <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp uge <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, %B;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, %B;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, %B;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, %B;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = and <8 x i8> %A, %B
+ %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer
+ %tmp5 = sext <8 x i1> %tmp4 to <8 x i8>
+ ret <8 x i8> %tmp5
+}
+
+define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = and <16 x i8> %A, %B
+ %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer
+ %tmp5 = sext <16 x i1> %tmp4 to <16 x i8>
+ ret <16 x i8> %tmp5
+}
+
+define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = and <4 x i16> %A, %B
+ %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer
+ %tmp5 = sext <4 x i1> %tmp4 to <4 x i16>
+ ret <4 x i16> %tmp5
+}
+
+define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = and <8 x i16> %A, %B
+ %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer
+ %tmp5 = sext <8 x i1> %tmp4 to <8 x i16>
+ ret <8 x i16> %tmp5
+}
+
+define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = and <2 x i32> %A, %B
+ %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer
+ %tmp5 = sext <2 x i1> %tmp4 to <2 x i32>
+ ret <2 x i32> %tmp5
+}
+
+define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = and <4 x i32> %A, %B
+ %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer
+ %tmp5 = sext <4 x i1> %tmp4 to <4 x i32>
+ ret <4 x i32> %tmp5
+}
+
+define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) {
+;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = and <2 x i64> %A, %B
+ %tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer
+ %tmp5 = sext <2 x i1> %tmp4 to <2 x i64>
+ ret <2 x i64> %tmp5
+}
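+
+; Note on the cmtst tests above: CMTST sets each lane to all-ones when the
+; AND of the two operands is non-zero, so the generic and + icmp ne 0 + sext
+; sequence should collapse to a single CMTST.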
+
+
+
+define <8 x i8> @cmeqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+ %tmp3 = icmp eq <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmeqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+ %tmp3 = icmp eq <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmeqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+ %tmp3 = icmp eq <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmeqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+ %tmp3 = icmp eq <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmeqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+ %tmp3 = icmp eq <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmeqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+ %tmp3 = icmp eq <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmeqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+ %tmp3 = icmp eq <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgez8xi8(<8 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+ %tmp3 = icmp sge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgez16xi8(<16 x i8> %A) {
+;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+ %tmp3 = icmp sge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgez4xi16(<4 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+ %tmp3 = icmp sge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgez8xi16(<8 x i16> %A) {
+;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+ %tmp3 = icmp sge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgez2xi32(<2 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+ %tmp3 = icmp sge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgez4xi32(<4 x i32> %A) {
+;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+ %tmp3 = icmp sge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgez2xi64(<2 x i64> %A) {
+;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+ %tmp3 = icmp sge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <8 x i8> @cmgtz8xi8(<8 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+ %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmgtz16xi8(<16 x i8> %A) {
+;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+ %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmgtz4xi16(<4 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+ %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmgtz8xi16(<8 x i16> %A) {
+;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+ %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmgtz2xi32(<2 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+ %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmgtz4xi32(<4 x i32> %A) {
+;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+ %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmgtz2xi64(<2 x i64> %A) {
+;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+ %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlez8xi8(<8 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+ %tmp3 = icmp sle <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlez16xi8(<16 x i8> %A) {
+;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+ %tmp3 = icmp sle <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlez4xi16(<4 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+ %tmp3 = icmp sle <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlez8xi16(<8 x i16> %A) {
+;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+ %tmp3 = icmp sle <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlez2xi32(<2 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+ %tmp3 = icmp sle <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlez4xi32(<4 x i32> %A) {
+;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+ %tmp3 = icmp sle <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlez2xi64(<2 x i64> %A) {
+;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+ %tmp3 = icmp sle <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmltz8xi8(<8 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+ %tmp3 = icmp slt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmltz16xi8(<16 x i8> %A) {
+;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+ %tmp3 = icmp slt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmltz4xi16(<4 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+ %tmp3 = icmp slt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmltz8xi16(<8 x i16> %A) {
+;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+ %tmp3 = icmp slt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmltz2xi32(<2 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+ %tmp3 = icmp slt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmltz4xi32(<4 x i32> %A) {
+;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+ %tmp3 = icmp slt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmltz2xi64(<2 x i64> %A) {
+;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+ %tmp3 = icmp slt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmneqz8xi8(<8 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmneqz16xi8(<16 x i8> %A) {
+;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmneqz4xi16(<4 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmneqz8xi16(<8 x i16> %A) {
+;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmneqz2xi32(<2 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ne <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmneqz4xi32(<4 x i32> %A) {
+;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmneqz2xi64(<2 x i64> %A) {
+;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ne <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmhsz8xi8(<8 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp uge <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhsz16xi8(<16 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp uge <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhsz4xi16(<4 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp uge <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhsz8xi16(<8 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp uge <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhsz2xi32(<2 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp uge <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhsz4xi32(<4 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp uge <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhsz2xi64(<2 x i64> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp uge <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
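+
+; Note on the cmhsz tests above: an unsigned >= 0 comparison is trivially
+; all-true and could in principle be constant-folded; these tests pin the
+; current lowering, which materializes a zero vector with MOVI and emits CMHS.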
+
+
+define <8 x i8> @cmhiz8xi8(<8 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmhiz16xi8(<16 x i8> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmhiz4xi16(<4 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmhiz8xi16(<8 x i16> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmhiz2xi32(<2 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmhiz4xi32(<4 x i32> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmhiz2xi64(<2 x i64> %A) {
+;CHECK: movi {{v[0-9]+}}.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmlsz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ule <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmlsz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ule <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmlsz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ule <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmlsz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ule <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmlsz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ule <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmlsz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ule <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmlsz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LS implemented as HS, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ule <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <8 x i8> @cmloz8xi8(<8 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b
+ %tmp3 = icmp ult <8 x i8> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i8>
+ ret <8 x i8> %tmp4
+}
+
+define <16 x i8> @cmloz16xi8(<16 x i8> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b
+ %tmp3 = icmp ult <16 x i8> %A, zeroinitializer;
+ %tmp4 = sext <16 x i1> %tmp3 to <16 x i8>
+ ret <16 x i8> %tmp4
+}
+
+define <4 x i16> @cmloz4xi16(<4 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h
+ %tmp3 = icmp ult <4 x i16> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i16>
+ ret <4 x i16> %tmp4
+}
+
+define <8 x i16> @cmloz8xi16(<8 x i16> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h
+ %tmp3 = icmp ult <8 x i16> %A, zeroinitializer;
+ %tmp4 = sext <8 x i1> %tmp3 to <8 x i16>
+ ret <8 x i16> %tmp4
+}
+
+define <2 x i32> @cmloz2xi32(<2 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.8b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = icmp ult <2 x i32> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @cmloz4xi32(<4 x i32> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = icmp ult <4 x i32> %A, zeroinitializer;
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @cmloz2xi64(<2 x i64> %A) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; LO implemented as HI, so check reversed operands.
+;CHECK: movi v1.16b, #0x0
+;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = icmp ult <2 x i64> %A, zeroinitializer;
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
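+
+; Note on the cmloz tests above: an unsigned < 0 comparison is trivially
+; all-false; as with the CMHS cases, these tests pin the current lowering
+; (MOVI zero, then CMHI with reversed operands) rather than a folded result.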
+
+
+define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp oeq <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp oeq <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp oeq <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp oge <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp oge <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp oge <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fcmp ogt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fcmp ogt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fcmp ogt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp ole <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp ole <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLE implemented as OGE, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp ole <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp olt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp olt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp olt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp one <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ord <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+
+define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ord <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ord <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp uno <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uno <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uno <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ueq <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ueq <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ueq <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp uge <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uge <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UGE = ULE with swapped operands, ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uge <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ugt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ugt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) {
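+; Using registers other than v0, v1 is possible, but would be odd.
+; UGT = ULT with swapped operands, ULT implemented as !OGE.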
+;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ugt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ule <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ule <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULE implemented as !OGT.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ule <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ult <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ult <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; ULT implemented as !OGE.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ult <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp une <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp une <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 is possible, but would be odd.
+; UNE = !OEQ.
+;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp une <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+ %tmp3 = fcmp oeq <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+ %tmp3 = fcmp oeq <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) {
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+ %tmp3 = fcmp oeq <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmogez2xfloat(<2 x float> %A) {
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+ %tmp3 = fcmp oge <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogez4xfloat(<4 x float> %A) {
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+ %tmp3 = fcmp oge <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogez2xdouble(<2 x double> %A) {
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+ %tmp3 = fcmp oge <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+ %tmp3 = fcmp ogt <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+ %tmp3 = fcmp ogt <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) {
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+ %tmp3 = fcmp ogt <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+ %tmp3 = fcmp olt <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+ %tmp3 = fcmp olt <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) {
+;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+ %tmp3 = fcmp olt <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmolez2xfloat(<2 x float> %A) {
+;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+ %tmp3 = fcmp ole <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolez4xfloat(<4 x float> %A) {
+;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+ %tmp3 = fcmp ole <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmolez2xdouble(<2 x double> %A) {
+;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+ %tmp3 = fcmp ole <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmonez2xfloat(<2 x float> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp one <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmonez4xfloat(<4 x float> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmonez2xdouble(<2 x double> %A) {
+; ONE with zero = OLT | OGT
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmordz2xfloat(<2 x float> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ord <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmordz4xfloat(<4 x float> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ord <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmordz2xdouble(<2 x double> %A) {
+; ORD with zero = OLT | OGE
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ord <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) {
+; UEQ with zero = !ONE = !(OLT | OGT)
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugez2xfloat(<2 x float> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp uge <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugez4xfloat(<4 x float> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uge <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugez2xdouble(<2 x double> %A) {
+; UGE with zero = !OLT
+;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uge <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) {
+; UGT with zero = !OLE
+;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmultz2xfloat(<2 x float> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ult <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmultz4xfloat(<4 x float> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ult <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmultz2xdouble(<2 x double> %A) {
+; ULT with zero = !OGE
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ult <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmulez2xfloat(<2 x float> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ule <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmulez4xfloat(<4 x float> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ule <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmulez2xdouble(<2 x double> %A) {
+; ULE with zero = !OGT
+;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ule <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmunez2xfloat(<2 x float> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp une <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmunez4xfloat(<4 x float> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp une <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmunez2xdouble(<2 x double> %A) {
+; UNE with zero = !OEQ with zero
+;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp une <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp uno <2 x float> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uno <4 x float> %A, zeroinitializer
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) {
+; UNO with zero = !ORD = !(OLT | OGE)
+;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff
+;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp uno <2 x double> %A, zeroinitializer
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
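+
+; Note on the unordered predicates above (descriptive comment only, not
+; matched by FileCheck): AdvSIMD has no unordered vector compare, so each
+; one is lowered as the complement of its ordered dual: ULE = !OGT,
+; ULT = !OGE, UNE = !OEQ, UEQ = !(OLT | OGT), UNO = !(OLT | OGE).
+; Worked example against zero with %A = <1.0, NaN>: fcmgt #0.0 yields
+; <all-ones, all-zeros>, because a NaN lane fails every ordered compare;
+; the movi/eor pair then inverts that mask to <all-zeros, all-ones>,
+; which is exactly fcmp ule on those lanes.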
diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll
new file mode 100644
index 0000000000..146256e4be
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-facge-facgt.ll
@@ -0,0 +1,56 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>)
+
+define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: facge_from_intr_v2i32:
+ %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B)
+; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ ret <2 x i32> %val
+}
+define <4 x i32> @facge_from_intr_v4i32(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: facge_from_intr_v4i32:
+ %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B)
+; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ ret <4 x i32> %val
+}
+
+define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: facge_from_intr_v2i64:
+ %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B)
+; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ ret <2 x i64> %val
+}
+
+declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>)
+declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>)
+declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>)
+
+define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: facgt_from_intr_v2i32:
+ %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ ret <2 x i32> %val
+}
+define <4 x i32> @facgt_from_intr_v4i32(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: facgt_from_intr_v4i32:
+ %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B)
+; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ ret <4 x i32> %val
+}
+
+define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: facgt_from_intr_v2i64:
+ %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B)
+; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ ret <2 x i64> %val
+}
+
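+; Descriptive comment only: the "ac" in facge/facgt is "absolute compare".
+; Each instruction compares |lhs| against |rhs| lanewise and produces an
+; all-ones/all-zeros mask, e.g. facge on lanes -4.0 and 3.0 is all-ones,
+; since 4.0 >= 3.0. There is no single IR construct for this short of an
+; explicit fabs + fcmp, which is presumably why these tests go through the
+; vacge/vacgt intrinsics.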
diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll
new file mode 100644
index 0000000000..dcf4e28780
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fma.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = fmul <2 x float> %A, %B
+ %tmp2 = fadd <2 x float> %C, %tmp1
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = fmul <4 x float> %A, %B
+ %tmp2 = fadd <4 x float> %C, %tmp1
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp1 = fmul <2 x double> %A, %B
+ %tmp2 = fadd <2 x double> %C, %tmp1
+ ret <2 x double> %tmp2
+}
+
+
+define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = fmul <2 x float> %A, %B
+ %tmp2 = fsub <2 x float> %C, %tmp1
+ ret <2 x float> %tmp2
+}
+
+define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = fmul <4 x float> %A, %B
+ %tmp2 = fsub <4 x float> %C, %tmp1
+ ret <4 x float> %tmp2
+}
+
+define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp1 = fmul <2 x double> %A, %B
+ %tmp2 = fsub <2 x double> %C, %tmp1
+ ret <2 x double> %tmp2
+}
+
+
+; Another set of tests for when the intrinsic is used.
+
+declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
+
+define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %negA = fsub <2 x float> <float -0.0, float -0.0>, %A
+ %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A
+ %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmls {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %negA = fsub <2 x double> <double -0.0, double -0.0>, %A
+ %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>)
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>)
+
+define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) {
+;CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) {
+;CHECK: fmla {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C)
+ ret <2 x double> %val
+}
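+
+; Descriptive comment only: the first six functions form fmla/fmls solely
+; because -fp-contract=fast on the RUN line lets the backend contract the
+; separate fmul + fadd/fsub. llvm.fma requires a fused operation
+; unconditionally and llvm.fmuladd merely permits fusion, so the later
+; functions should select fmla even without that flag. fmls is matched by
+; negating one multiplicand of the fma (the fsub from -0.0 idiom used above).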
diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
new file mode 100644
index 0000000000..46fe25d74d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+; Set of tests for when the intrinsic is used.
+
+declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.2s, v0.2s, v1.2s
+ %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ ret <2 x float> %val
+}
+
+define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.4s, v0.4s, v1.4s
+ %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ ret <4 x float> %val
+}
+
+define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frsqrts v0.2d, v0.2d, v1.2d
+ %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.2s, v0.2s, v1.2s
+ %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ ret <2 x float> %val
+}
+
+define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.4s, v0.4s, v1.4s
+ %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ ret <4 x float> %val
+}
+
+define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: frecps v0.2d, v0.2d, v1.2d
+ %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ ret <2 x double> %val
+}
+
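+; A hypothetical sketch (not matched by any CHECK line) of how these step
+; intrinsics are meant to be used: frsqrts(n, m) computes (3.0 - n*m) / 2.0,
+; so one Newton-Raphson refinement of an estimate %x of 1/sqrt(%a) is
+; x' = x * frsqrts(a*x, x). frecps(n, m) = 2.0 - n*m refines a reciprocal
+; estimate the same way, via x' = x * frecps(a, x).
+define <2 x float> @rsqrt_refine_sketch(<2 x float> %a, <2 x float> %x) {
+  %ax = fmul <2 x float> %a, %x
+  %step = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %ax, <2 x float> %x)
+  %refined = fmul <2 x float> %x, %step
+  ret <2 x float> %refined
+}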
diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll
new file mode 100644
index 0000000000..a8f59dbdb0
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-halving-add-sub.ll
@@ -0,0 +1,207 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uhadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uhadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_shadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: shadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uhadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uhadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_shadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: shadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uhadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uhadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_shadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: shadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uhadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uhadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_shadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: shadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uhadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uhadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_shadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: shadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uhadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uhadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_shadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: shadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
+declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uhsub_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uhsub v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_shsub_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: shsub v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uhsub_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uhsub v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_shsub_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: shsub v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uhsub_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uhsub v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_shsub_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: shsub v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uhsub_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uhsub v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_shsub_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: shsub v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uhsub_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uhsub v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_shsub_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: shsub v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uhsub_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uhsub v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_shsub_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: shsub v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
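+; Semantics sketch (descriptive comment only): uhadd/shadd compute the
+; lanewise average (a + b) >> 1 with the addition performed at double
+; width, so it never wraps: uhadd on i8 lanes 200 and 100 yields 150
+; rather than halving the wrapped sum 44. uhsub/shsub likewise halve the
+; difference, with the u/s prefix selecting unsigned or signed widening.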
diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
new file mode 100644
index 0000000000..d757aca86a
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smaxp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smaxp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umaxp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umaxp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smaxp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smaxp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umaxp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umaxp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smaxp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smaxp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umaxp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umaxp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smaxp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smaxp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umaxp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umaxp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smaxp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smaxp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umaxp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umaxp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smaxp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smaxp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umaxp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umaxp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_sminp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sminp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uminp_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uminp v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sminp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sminp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uminp_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uminp v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sminp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sminp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uminp_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uminp v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sminp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sminp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uminp_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uminp v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sminp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sminp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uminp_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uminp v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sminp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sminp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uminp_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uminp v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxp_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxp_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxp_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminp_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminp_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminp_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxnmp_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxnmp_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxnmp_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminnmp_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminnmp v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminnmp_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminnmp v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminnmp_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminnmp v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
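+; Semantics sketch (descriptive comment only): the pairwise forms reduce
+; adjacent lane pairs of the concatenation lhs:rhs, so for v2i32 smaxp
+; gives <max(lhs[0], lhs[1]), max(rhs[0], rhs[1])>. fmaxnmp/fminnmp
+; additionally follow IEEE-754-2008 maxNum/minNum and prefer a number over
+; a quiet NaN operand.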
diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll
new file mode 100644
index 0000000000..7889c77e37
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-max-min.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
+declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smax_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smax v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umax_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umax v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smax_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smax v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umax_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umax v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smax_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smax v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umax_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umax v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smax_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smax v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umax_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umax v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smax_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smax v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umax_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umax v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smax_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smax v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umax_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umax v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; Using registers other than v0 and v1 is possible, but would be odd.
+; CHECK: test_smin_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: smin v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_umin_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: umin v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_smin_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: smin v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_umin_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: umin v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_smin_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: smin v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_umin_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: umin v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+
+declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_smin_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: smin v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_umin_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: umin v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+
+declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_smin_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: smin v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_umin_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: umin v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_smin_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: smin v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_umin_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: umin v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmax_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmax v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmax_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmax v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmax_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmax v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmin_v2f32:
+ %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmin v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmin_v4f32:
+ %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmin v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmin_v2f64:
+ %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmin v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+
+declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fmaxnm_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fmaxnm v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fmaxnm_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fmaxnm v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fmaxnm_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fmaxnm v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
+
+declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; CHECK: test_fminnm_v2f32:
+ %val = call <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+; CHECK: fminnm v0.2s, v0.2s, v1.2s
+ ret <2 x float> %val
+}
+
+define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; CHECK: test_fminnm_v4f32:
+ %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+; CHECK: fminnm v0.4s, v0.4s, v1.4s
+ ret <4 x float> %val
+}
+
+define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; CHECK: test_fminnm_v2f64:
+ %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+; CHECK: fminnm v0.2d, v0.2d, v1.2d
+ ret <2 x double> %val
+}
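+
+; Semantics note (descriptive comment only): fmax/fmin return NaN if either
+; operand is NaN, while fmaxnm/fminnm implement IEEE-754-2008 maxNum/minNum,
+; so fmaxnm(NaN, 3.0) is 3.0: a quiet NaN is ignored when the other operand
+; is a number.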
diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll
new file mode 100644
index 0000000000..23e9223a8b
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-mla-mls.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+;CHECK: mla {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = mul <8 x i8> %A, %B
+ %tmp2 = add <8 x i8> %C, %tmp1
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+;CHECK: mla {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = mul <16 x i8> %A, %B
+ %tmp2 = add <16 x i8> %C, %tmp1
+ ret <16 x i8> %tmp2
+}
+
+define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+;CHECK: mla {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = mul <4 x i16> %A, %B
+ %tmp2 = add <4 x i16> %C, %tmp1
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+;CHECK: mla {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp1 = mul <8 x i16> %A, %B;
+ %tmp2 = add <8 x i16> %C, %tmp1;
+ ret <8 x i16> %tmp2
+}
+
+define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+;CHECK: mla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = mul <2 x i32> %A, %B;
+ %tmp2 = add <2 x i32> %C, %tmp1;
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+;CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = mul <4 x i32> %A, %B;
+ %tmp2 = add <4 x i32> %C, %tmp1;
+ ret <4 x i32> %tmp2
+}
+
+define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) {
+;CHECK: mls {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp1 = mul <8 x i8> %A, %B;
+ %tmp2 = sub <8 x i8> %C, %tmp1;
+ ret <8 x i8> %tmp2
+}
+
+define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) {
+;CHECK: mls {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp1 = mul <16 x i8> %A, %B;
+ %tmp2 = sub <16 x i8> %C, %tmp1;
+ ret <16 x i8> %tmp2
+}
+
+define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) {
+;CHECK: mls {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp1 = mul <4 x i16> %A, %B;
+ %tmp2 = sub <4 x i16> %C, %tmp1;
+ ret <4 x i16> %tmp2
+}
+
+define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) {
+;CHECK: mls {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp1 = mul <8 x i16> %A, %B;
+ %tmp2 = sub <8 x i16> %C, %tmp1;
+ ret <8 x i16> %tmp2
+}
+
+define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) {
+;CHECK: mls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp1 = mul <2 x i32> %A, %B;
+ %tmp2 = sub <2 x i32> %C, %tmp1;
+ ret <2 x i32> %tmp2
+}
+
+define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
+;CHECK: mls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp1 = mul <4 x i32> %A, %B;
+ %tmp2 = sub <4 x i32> %C, %tmp1;
+ ret <4 x i32> %tmp2
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll
new file mode 100644
index 0000000000..42f6a894da
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-mov.ll
@@ -0,0 +1,205 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
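+; movi materialises a vector constant from an 8-bit immediate, optionally
+; shifted into position; mvni materialises the bitwise NOT of the same
+; expanded pattern.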
+define <8 x i8> @movi8b() {
+;CHECK: movi {{v[0-9]+}}.8b, #0x8
+ ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <16 x i8> @movi16b() {
+;CHECK: movi {{v[0-9]+}}.16b, #0x8
+ ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 >
+}
+
+define <2 x i32> @movi2s_lsl0() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff
+ ret <2 x i32> < i32 255, i32 255 >
+}
+
+define <2 x i32> @movi2s_lsl8() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #8
+ ret <2 x i32> < i32 65280, i32 65280 >
+}
+
+define <2 x i32> @movi2s_lsl16() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #16
+ ret <2 x i32> < i32 16711680, i32 16711680 >
+}
+
+define <2 x i32> @movi2s_lsl24() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, lsl #24
+ ret <2 x i32> < i32 4278190080, i32 4278190080 >
+}
+
+define <4 x i32> @movi4s_lsl0() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff
+ ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 >
+}
+
+define <4 x i32> @movi4s_lsl8() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #8
+ ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 >
+}
+
+define <4 x i32> @movi4s_lsl16() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #16
+ ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 >
+}
+
+define <4 x i32> @movi4s_lsl24() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, lsl #24
+ ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 >
+}
+
+define <4 x i16> @movi4h_lsl0() {
+;CHECK: movi {{v[0-9]+}}.4h, #0xff
+ ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 >
+}
+
+define <4 x i16> @movi4h_lsl8() {
+;CHECK: movi {{v[0-9]+}}.4h, #0xff, lsl #8
+ ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 >
+}
+
+define <8 x i16> @movi8h_lsl0() {
+;CHECK: movi {{v[0-9]+}}.8h, #0xff
+ ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 >
+}
+
+define <8 x i16> @movi8h_lsl8() {
+;CHECK: movi {{v[0-9]+}}.8h, #0xff, lsl #8
+ ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 >
+}
+
+
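+; mvni inverts the expanded immediate, e.g. #0x10 yields ~0x10 = 0xffffffef
+; (4294967279) in each 32-bit lane.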
+define <2 x i32> @mvni2s_lsl0() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10
+ ret <2 x i32> < i32 4294967279, i32 4294967279 >
+}
+
+define <2 x i32> @mvni2s_lsl8() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #8
+ ret <2 x i32> < i32 4294963199, i32 4294963199 >
+}
+
+define <2 x i32> @mvni2s_lsl16() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #16
+ ret <2 x i32> < i32 4293918719, i32 4293918719 >
+}
+
+define <2 x i32> @mvni2s_lsl24() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, lsl #24
+ ret <2 x i32> < i32 4026531839, i32 4026531839 >
+}
+
+define <4 x i32> @mvni4s_lsl0() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10
+ ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 >
+}
+
+define <4 x i32> @mvni4s_lsl8() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #8
+ ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 >
+}
+
+define <4 x i32> @mvni4s_lsl16() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #16
+ ret <4 x i32> < i32 4293918719, i32 4293918719, i32 4293918719, i32 4293918719 >
+}
+
+define <4 x i32> @mvni4s_lsl24() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, lsl #24
+ ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 >
+}
+
+
+define <4 x i16> @mvni4h_lsl0() {
+;CHECK: mvni {{v[0-9]+}}.4h, #0x10
+ ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <4 x i16> @mvni4h_lsl8() {
+;CHECK: mvni {{v[0-9]+}}.4h, #0x10, lsl #8
+ ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
+define <8 x i16> @mvni8h_lsl0() {
+;CHECK: mvni {{v[0-9]+}}.8h, #0x10
+ ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 >
+}
+
+define <8 x i16> @mvni8h_lsl8() {
+;CHECK: mvni {{v[0-9]+}}.8h, #0x10, lsl #8
+ ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 >
+}
+
+
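+; msl shifts ones rather than zeros into the low bits, so #0xff, msl #8
+; expands to (0xff << 8) | 0xff = 0xffff per lane.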
+define <2 x i32> @movi2s_msl8() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #8
+ ret <2 x i32> < i32 65535, i32 65535 >
+}
+
+define <2 x i32> @movi2s_msl16() {
+;CHECK: movi {{v[0-9]+}}.2s, #0xff, msl #16
+ ret <2 x i32> < i32 16777215, i32 16777215 >
+}
+
+
+define <4 x i32> @movi4s_msl8() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #8
+ ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 >
+}
+
+define <4 x i32> @movi4s_msl16() {
+;CHECK: movi {{v[0-9]+}}.4s, #0xff, msl #16
+ ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 >
+}
+
+define <2 x i32> @mvni2s_msl8() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #8
+ ret <2 x i32> < i32 4294962944, i32 4294962944 >
+}
+
+define <2 x i32> @mvni2s_msl16() {
+;CHECK: mvni {{v[0-9]+}}.2s, #0x10, msl #16
+ ret <2 x i32> < i32 4293853184, i32 4293853184 >
+}
+
+define <4 x i32> @mvni4s_msl8() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #8
+ ret <4 x i32> < i32 4294962944, i32 4294962944, i32 4294962944, i32 4294962944 >
+}
+
+define <4 x i32> @mvni4s_msl16() {
+;CHECK: mvni {{v[0-9]+}}.4s, #0x10, msl #16
+ ret <4 x i32> < i32 4293853184, i32 4293853184, i32 4293853184, i32 4293853184 >
+}
+
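+; The 64-bit movi form expands each of the eight immediate bits to a byte
+; of all-zeros or all-ones, giving patterns like 0xff0000ff0000ffff.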
+define <2 x i64> @movi2d() {
+;CHECK: movi {{v[0-9]+}}.2d, #0xff0000ff0000ffff
+ ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 >
+}
+
+define <1 x i64> @movid() {
+;CHECK: movi {{d[0-9]+}}, #0xff0000ff0000ffff
+ ret <1 x i64> < i64 18374687574888349695 >
+}
+
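+; fmov (vector, immediate) replicates a floating-point constant that fits
+; the 8-bit encoding (sign, 3-bit exponent, 4-bit fraction); -12.0 does.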
+define <2 x float> @fmov2s() {
+;CHECK: fmov {{v[0-9]+}}.2s, #-12.00000000
+ ret <2 x float> < float -1.2e1, float -1.2e1>
+}
+
+define <4 x float> @fmov4s() {
+;CHECK: fmov {{v[0-9]+}}.4s, #-12.00000000
+ ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1>
+}
+
+define <2 x double> @fmov2d() {
+;CHECK: fmov {{v[0-9]+}}.2d, #-12.00000000
+ ret <2 x double> < double -1.2e1, double -1.2e1>
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll
new file mode 100644
index 0000000000..e1be313266
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-mul-div.ll
@@ -0,0 +1,181 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+
+define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) {
+;CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = mul <8 x i8> %A, %B;
+ ret <8 x i8> %tmp3
+}
+
+define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) {
+;CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = mul <16 x i8> %A, %B;
+ ret <16 x i8> %tmp3
+}
+
+define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) {
+;CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h
+ %tmp3 = mul <4 x i16> %A, %B;
+ ret <4 x i16> %tmp3
+}
+
+define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) {
+;CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h
+ %tmp3 = mul <8 x i16> %A, %B;
+ ret <8 x i16> %tmp3
+}
+
+define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) {
+;CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = mul <2 x i32> %A, %B;
+ ret <2 x i32> %tmp3
+}
+
+define <4 x i32> @mul4xi32(<4 x i32> %A, <4 x i32> %B) {
+;CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = mul <4 x i32> %A, %B;
+ ret <4 x i32> %tmp3
+}
+
+define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fmul <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fmul <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fmul <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
+
+define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) {
+;CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+ %tmp3 = fdiv <2 x float> %A, %B;
+ ret <2 x float> %tmp3
+}
+
+define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) {
+;CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
+ %tmp3 = fdiv <4 x float> %A, %B;
+ ret <4 x float> %tmp3
+}
+
+define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) {
+;CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
+ %tmp3 = fdiv <2 x double> %A, %B;
+ ret <2 x double> %tmp3
+}
+
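+; pmul is a polynomial (carry-less) multiply over GF(2): partial products
+; are combined with XOR instead of integer addition.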
+declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>)
+declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>)
+
+define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: poly_mulv8i8:
+ %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: pmul v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %prod
+}
+
+define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: poly_mulv16i8:
+ %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: pmul v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %prod
+}
+
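+; sqdmulh returns the high half of the doubled product, saturated:
+; sat((2 * a * b) >> lane_bits) per lane.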
+declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
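+; sqrdmulh is the rounding variant:
+; sat((2 * a * b + (1 << (lane_bits - 1))) >> lane_bits) per lane.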
+declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqrdmulh_v4i16:
+ %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %prod
+}
+
+define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqrdmulh_v8i16:
+ %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %prod
+}
+
+define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqrdmulh_v2i32:
+ %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %prod
+}
+
+define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqrdmulh_v4i32:
+ %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %prod
+}
+
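+; fmulx behaves like fmul except that (+/-0) * (+/-infinity) returns 2.0
+; with the XOR of the operand signs, instead of a NaN.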
+declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>)
+declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
+
+define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2s, v0.2s, v1.2s
+ %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs)
+ ret <2 x float> %val
+}
+
+define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.4s, v0.4s, v1.4s
+ %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs)
+ ret <4 x float> %val
+}
+
+define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
+; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2d, v0.2d, v1.2d
+ %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs)
+ ret <2 x double> %val
+}
diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll
new file mode 100644
index 0000000000..009da3b51a
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-rounding-halving-add.ll
@@ -0,0 +1,105 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
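+; urhadd/srhadd compute a rounding halving add, (a + b + 1) >> 1, using a
+; wide enough intermediate that the addition cannot overflow.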
+declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_urhadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: urhadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_srhadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: srhadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_urhadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: urhadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_srhadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: srhadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_urhadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: urhadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_srhadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: srhadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_urhadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: urhadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_srhadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: srhadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_urhadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: urhadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_srhadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: srhadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_urhadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: urhadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_srhadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: srhadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+
diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll
new file mode 100644
index 0000000000..404e49185e
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -0,0 +1,138 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
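+; urshl/srshl shift each lane by a signed per-lane amount taken from the
+; second operand; negative amounts shift right, with rounding.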
+declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_urshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: urshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_srshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: srshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_urshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: urshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_srshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: srshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_urshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: urshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_srshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: srshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_urshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: urshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_srshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: srshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_urshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: urshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_srshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: srshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_urshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: urshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_srshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: srshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_urshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: urshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_srshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: srshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_urshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: urshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_srshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: srshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
new file mode 100644
index 0000000000..b2fac1fbc1
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll
@@ -0,0 +1,274 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
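+; uqadd/sqadd and uqsub/sqsub saturate instead of wrapping, clamping each
+; lane to the unsigned or signed range of its element type.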
+declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqadd_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqadd v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqadd_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqadd v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqadd_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqadd v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqadd_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqadd v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqadd_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqadd v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqadd_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqadd v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqadd_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqadd d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqadd_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqadd d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqadd_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqadd v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqadd_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqadd v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqsub_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqsub v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqsub_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqsub v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqsub_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqsub v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqsub_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqsub v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqsub_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqsub v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqsub_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqsub v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqsub_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqsub v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqsub_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqsub v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqsub_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqsub v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqsub_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqsub v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqsub_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqsub v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqsub_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqsub v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqsub_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqsub v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqsub_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqsub v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqsub_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqsub d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqsub_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqsub d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
new file mode 100644
index 0000000000..05d8dfea9d
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
@@ -0,0 +1,138 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
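+; uqrshl/sqrshl combine both behaviours: negative per-lane shift amounts
+; shift right with rounding, and left shifts saturate on overflow.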
+declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqrshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqrshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqrshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqrshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqrshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqrshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqrshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqrshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqrshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqrshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqrshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqrshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqrshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqrshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqrshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqrshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqrshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqrshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqrshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqrshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqrshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqrshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqrshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqrshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqrshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqrshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqrshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqrshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqrshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqrshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqrshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqrshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll
new file mode 100644
index 0000000000..3b7f78cc79
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-saturating-shift.ll
@@ -0,0 +1,138 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
+
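+; uqshl/sqshl shift left by a signed per-lane amount and saturate any lane
+; whose result would overflow its element type.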
+declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_uqshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: uqshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sqshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sqshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_uqshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: uqshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sqshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sqshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_uqshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: uqshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sqshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sqshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_uqshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: uqshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sqshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sqshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_uqshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: uqshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sqshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sqshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_uqshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: uqshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sqshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sqshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_uqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: uqshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sqshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sqshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_uqshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: uqshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sqshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sqshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll
new file mode 100644
index 0000000000..45a2605799
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-shift.ll
@@ -0,0 +1,140 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
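+; ushl/sshl are the plain variants: the per-lane shift amount is signed,
+; negative values shift right, and there is no rounding or saturation.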
+declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
+declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>)
+
+define <8 x i8> @test_ushl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_ushl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: ushl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @test_sshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
+; CHECK: test_sshl_v8i8:
+ %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
+; CHECK: sshl v0.8b, v0.8b, v1.8b
+ ret <8 x i8> %tmp1
+}
+
+declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>)
+declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_ushl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: ushl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
+; CHECK: test_sshl_v16i8:
+ %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
+; CHECK: sshl v0.16b, v0.16b, v1.16b
+ ret <16 x i8> %tmp1
+}
+
+declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>)
+
+define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_ushl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: ushl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
+; CHECK: test_sshl_v4i16:
+ %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
+; CHECK: sshl v0.4h, v0.4h, v1.4h
+ ret <4 x i16> %tmp1
+}
+
+declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_ushl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: ushl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK: test_sshl_v8i16:
+ %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
+; CHECK: sshl v0.8h, v0.8h, v1.8h
+ ret <8 x i16> %tmp1
+}
+
+declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>)
+declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>)
+
+define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_ushl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: ushl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
+; CHECK: test_sshl_v2i32:
+ %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
+; CHECK: sshl v0.2s, v0.2s, v1.2s
+ ret <2 x i32> %tmp1
+}
+
+declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>)
+
+define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_ushl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: ushl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
+; CHECK: test_sshl_v4i32:
+ %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
+; CHECK: sshl v0.4s, v0.4s, v1.4s
+ ret <4 x i32> %tmp1
+}
+
+declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
+declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>)
+
+define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_ushl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: ushl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) {
+; CHECK: test_sshl_v1i64:
+ %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs)
+; CHECK: sshl d0, d0, d1
+ ret <1 x i64> %tmp1
+}
+
+declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>)
+declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>)
+
+define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_ushl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: ushl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
+; CHECK: test_sshl_v2i64:
+ %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
+; CHECK: sshl v0.2d, v0.2d, v1.2d
+ ret <2 x i64> %tmp1
+}
+
+
+
diff --git a/test/MC/AArch64/basic-a64-diagnostics.s b/test/MC/AArch64/basic-a64-diagnostics.s
index 1e9024c5ee..2e6e0bbd38 100644
--- a/test/MC/AArch64/basic-a64-diagnostics.s
+++ b/test/MC/AArch64/basic-a64-diagnostics.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple=aarch64 < %s 2> %t
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu < %s 2> %t
// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
//------------------------------------------------------------------------------
@@ -2892,13 +2892,13 @@
movi wzr, #0x44444444
movi w3, #0xffff
movi x9, #0x0000ffff00000000
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: movi wzr, #0x44444444
// CHECK-ERROR-NEXT: ^
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: movi w3, #0xffff
// CHECK-ERROR-NEXT: ^
-// CHECK-ERROR: error: invalid instruction
+// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR-NEXT: movi x9, #0x0000ffff00000000
// CHECK-ERROR-NEXT: ^
diff --git a/test/MC/AArch64/basic-a64-instructions.s b/test/MC/AArch64/basic-a64-instructions.s
index ad3064e5e5..e4f6b21892 100644
--- a/test/MC/AArch64/basic-a64-instructions.s
+++ b/test/MC/AArch64/basic-a64-instructions.s
@@ -1,4 +1,4 @@
-// RUN: llvm-mc -triple=aarch64 -show-encoding < %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding < %s | FileCheck %s
.globl _func
// Check that the assembler can handle the documented syntax from the ARM ARM.
diff --git a/test/MC/AArch64/neon-aba-abd.s b/test/MC/AArch64/neon-aba-abd.s
new file mode 100644
index 0000000000..178eb26f64
--- /dev/null
+++ b/test/MC/AArch64/neon-aba-abd.s
@@ -0,0 +1,78 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+//----------------------------------------------------------------------
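+// uaba/saba accumulate the absolute difference: Vd += |Vn - Vm| per lane.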
+ uaba v0.8b, v1.8b, v2.8b
+ uaba v0.16b, v1.16b, v2.16b
+ uaba v0.4h, v1.4h, v2.4h
+ uaba v0.8h, v1.8h, v2.8h
+ uaba v0.2s, v1.2s, v2.2s
+ uaba v0.4s, v1.4s, v2.4s
+
+// CHECK: uaba v0.8b, v1.8b, v2.8b // encoding: [0x20,0x7c,0x22,0x2e]
+// CHECK: uaba v0.16b, v1.16b, v2.16b // encoding: [0x20,0x7c,0x22,0x6e]
+// CHECK: uaba v0.4h, v1.4h, v2.4h // encoding: [0x20,0x7c,0x62,0x2e]
+// CHECK: uaba v0.8h, v1.8h, v2.8h // encoding: [0x20,0x7c,0x62,0x6e]
+// CHECK: uaba v0.2s, v1.2s, v2.2s // encoding: [0x20,0x7c,0xa2,0x2e]
+// CHECK: uaba v0.4s, v1.4s, v2.4s // encoding: [0x20,0x7c,0xa2,0x6e]
+
+
+ saba v0.8b, v1.8b, v2.8b
+ saba v0.16b, v1.16b, v2.16b
+ saba v0.4h, v1.4h, v2.4h
+ saba v0.8h, v1.8h, v2.8h
+ saba v0.2s, v1.2s, v2.2s
+ saba v0.4s, v1.4s, v2.4s
+
+// CHECK: saba v0.8b, v1.8b, v2.8b // encoding: [0x20,0x7c,0x22,0x0e]
+// CHECK: saba v0.16b, v1.16b, v2.16b // encoding: [0x20,0x7c,0x22,0x4e]
+// CHECK: saba v0.4h, v1.4h, v2.4h // encoding: [0x20,0x7c,0x62,0x0e]
+// CHECK: saba v0.8h, v1.8h, v2.8h // encoding: [0x20,0x7c,0x62,0x4e]
+// CHECK: saba v0.2s, v1.2s, v2.2s // encoding: [0x20,0x7c,0xa2,0x0e]
+// CHECK: saba v0.4s, v1.4s, v2.4s // encoding: [0x20,0x7c,0xa2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Signed, Unsigned)
+//----------------------------------------------------------------------
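+// uabd/sabd compute |Vn - Vm| per lane, without accumulation.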
+ uabd v0.8b, v1.8b, v2.8b
+ uabd v0.16b, v1.16b, v2.16b
+ uabd v0.4h, v1.4h, v2.4h
+ uabd v0.8h, v1.8h, v2.8h
+ uabd v0.2s, v1.2s, v2.2s
+ uabd v0.4s, v1.4s, v2.4s
+
+// CHECK: uabd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x74,0x22,0x2e]
+// CHECK: uabd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x74,0x22,0x6e]
+// CHECK: uabd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x74,0x62,0x2e]
+// CHECK: uabd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x74,0x62,0x6e]
+// CHECK: uabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x74,0xa2,0x2e]
+// CHECK: uabd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x74,0xa2,0x6e]
+
+ sabd v0.8b, v1.8b, v2.8b
+ sabd v0.16b, v1.16b, v2.16b
+ sabd v0.4h, v1.4h, v2.4h
+ sabd v0.8h, v1.8h, v2.8h
+ sabd v0.2s, v1.2s, v2.2s
+ sabd v0.4s, v1.4s, v2.4s
+
+// CHECK: sabd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x74,0x22,0x0e]
+// CHECK: sabd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x74,0x22,0x4e]
+// CHECK: sabd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x74,0x62,0x0e]
+// CHECK: sabd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x74,0x62,0x4e]
+// CHECK: sabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x74,0xa2,0x0e]
+// CHECK: sabd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x74,0xa2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Floating Point)
+//----------------------------------------------------------------------
+ fabd v0.2s, v1.2s, v2.2s
+ fabd v31.4s, v15.4s, v16.4s
+ fabd v7.2d, v8.2d, v25.2d
+
+// CHECK: fabd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x2e]
+// CHECK: fabd v31.4s, v15.4s, v16.4s // encoding: [0xff,0xd5,0xb0,0x6e]
+// CHECK: fabd v7.2d, v8.2d, v25.2d // encoding: [0x07,0xd5,0xf9,0x6e]
+
diff --git a/test/MC/AArch64/neon-add-pairwise.s b/test/MC/AArch64/neon-add-pairwise.s
new file mode 100644
index 0000000000..b586c22548
--- /dev/null
+++ b/test/MC/AArch64/neon-add-pairwise.s
@@ -0,0 +1,35 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//------------------------------------------------------------------------------
+ addp v0.8b, v1.8b, v2.8b
+ addp v0.16b, v1.16b, v2.16b
+ addp v0.4h, v1.4h, v2.4h
+ addp v0.8h, v1.8h, v2.8h
+ addp v0.2s, v1.2s, v2.2s
+ addp v0.4s, v1.4s, v2.4s
+ addp v0.2d, v1.2d, v2.2d
+
+// CHECK: addp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xbc,0x22,0x0e]
+// CHECK: addp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xbc,0x22,0x4e]
+// CHECK: addp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xbc,0x62,0x0e]
+// CHECK: addp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xbc,0x62,0x4e]
+// CHECK: addp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xbc,0xa2,0x0e]
+// CHECK: addp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xbc,0xa2,0x4e]
+// CHECK: addp v0.2d, v1.2d, v2.2d // encoding: [0x20,0xbc,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//------------------------------------------------------------------------------
+ faddp v0.2s, v1.2s, v2.2s
+ faddp v0.4s, v1.4s, v2.4s
+ faddp v0.2d, v1.2d, v2.2d
+
+// CHECK: faddp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x2e]
+// CHECK: faddp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x6e]
+// CHECK: faddp v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x6e]
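+
+// Encoding note: faddp is the fadd opcode with the U bit set: compare
+// faddp v0.2s ([0x20,0xd4,0x22,0x2e]) with fadd v0.2s ([0x20,0xd4,0x22,0x0e])
+// in neon-add-sub-instructions.s.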
+
diff --git a/test/MC/AArch64/neon-add-sub-instructions.s b/test/MC/AArch64/neon-add-sub-instructions.s
new file mode 100644
index 0000000000..863798eaf0
--- /dev/null
+++ b/test/MC/AArch64/neon-add-sub-instructions.s
@@ -0,0 +1,82 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Add
+//------------------------------------------------------------------------------
+ add v0.8b, v1.8b, v2.8b
+ add v0.16b, v1.16b, v2.16b
+ add v0.4h, v1.4h, v2.4h
+ add v0.8h, v1.8h, v2.8h
+ add v0.2s, v1.2s, v2.2s
+ add v0.4s, v1.4s, v2.4s
+ add v0.2d, v1.2d, v2.2d
+
+// CHECK: add v0.8b, v1.8b, v2.8b // encoding: [0x20,0x84,0x22,0x0e]
+// CHECK: add v0.16b, v1.16b, v2.16b // encoding: [0x20,0x84,0x22,0x4e]
+// CHECK: add v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x62,0x0e]
+// CHECK: add v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x62,0x4e]
+// CHECK: add v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0xa2,0x0e]
+// CHECK: add v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0xa2,0x4e]
+// CHECK: add v0.2d, v1.2d, v2.2d // encoding: [0x20,0x84,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Sub
+//------------------------------------------------------------------------------
+ sub v0.8b, v1.8b, v2.8b
+ sub v0.16b, v1.16b, v2.16b
+ sub v0.4h, v1.4h, v2.4h
+ sub v0.8h, v1.8h, v2.8h
+ sub v0.2s, v1.2s, v2.2s
+ sub v0.4s, v1.4s, v2.4s
+ sub v0.2d, v1.2d, v2.2d
+
+// CHECK: sub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x84,0x22,0x2e]
+// CHECK: sub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x84,0x22,0x6e]
+// CHECK: sub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x62,0x2e]
+// CHECK: sub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x62,0x6e]
+// CHECK: sub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0xa2,0x2e]
+// CHECK: sub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0xa2,0x6e]
+// CHECK: sub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x84,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Add
+//------------------------------------------------------------------------------
+ fadd v0.2s, v1.2s, v2.2s
+ fadd v0.4s, v1.4s, v2.4s
+ fadd v0.2d, v1.2d, v2.2d
+
+// CHECK: fadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0x22,0x0e]
+// CHECK: fadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0x22,0x4e]
+// CHECK: fadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0x62,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Sub
+//------------------------------------------------------------------------------
+ fsub v0.2s, v1.2s, v2.2s
+ fsub v0.4s, v1.4s, v2.4s
+ fsub v0.2d, v1.2d, v2.2d
+
+// CHECK: fsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0xd4,0xa2,0x0e]
+// CHECK: fsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0xd4,0xa2,0x4e]
+// CHECK: fsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0xd4,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Add
+//------------------------------------------------------------------------------
+ add d31, d0, d16
+
+// CHECK: add d31, d0, d16 // encoding: [0x1f,0x84,0xf0,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Sub
+//------------------------------------------------------------------------------
+ sub d1, d7, d8
+
+// CHECK: sub d1, d7, d8 // encoding: [0xe1,0x84,0xe8,0x7e]
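+
+// Encoding note: the scalar top byte 0x5e (add) / 0x7e (sub) is the .2d
+// vector top byte 0x4e / 0x6e with bit 28 set, which selects the scalar
+// three-same encoding space.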
+
diff --git a/test/MC/AArch64/neon-bitwise-instructions.s b/test/MC/AArch64/neon-bitwise-instructions.s
new file mode 100644
index 0000000000..79d0a9b70b
--- /dev/null
+++ b/test/MC/AArch64/neon-bitwise-instructions.s
@@ -0,0 +1,60 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//------------------------------------------------------------------------------
+// Vector And
+//------------------------------------------------------------------------------
+ and v0.8b, v1.8b, v2.8b
+ and v0.16b, v1.16b, v2.16b
+
+// CHECK: and v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x22,0x0e]
+// CHECK: and v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x22,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Orr
+//------------------------------------------------------------------------------
+ orr v0.8b, v1.8b, v2.8b
+ orr v0.16b, v1.16b, v2.16b
+
+// CHECK: orr v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xa2,0x0e]
+// CHECK: orr v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xa2,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Eor
+//------------------------------------------------------------------------------
+ eor v0.8b, v1.8b, v2.8b
+ eor v0.16b, v1.16b, v2.16b
+
+// CHECK: eor v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x22,0x2e]
+// CHECK: eor v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x22,0x6e]
+
+
+//----------------------------------------------------------------------
+// Vector Bitwise
+//----------------------------------------------------------------------
+
+ bit v0.8b, v1.8b, v2.8b
+ bit v0.16b, v1.16b, v2.16b
+ bif v0.8b, v1.8b, v2.8b
+ bif v0.16b, v1.16b, v2.16b
+ bsl v0.8b, v1.8b, v2.8b
+ bsl v0.16b, v1.16b, v2.16b
+ orn v0.8b, v1.8b, v2.8b
+ orn v0.16b, v1.16b, v2.16b
+ bic v0.8b, v1.8b, v2.8b
+ bic v0.16b, v1.16b, v2.16b
+
+// CHECK: bit v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xa2,0x2e]
+// CHECK: bit v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xa2,0x6e]
+// CHECK: bif v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xe2,0x2e]
+// CHECK: bif v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xe2,0x6e]
+// CHECK: bsl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x62,0x2e]
+// CHECK: bsl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x62,0x6e]
+// CHECK: orn v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0xe2,0x0e]
+// CHECK: orn v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0xe2,0x4e]
+// CHECK: bic v0.8b, v1.8b, v2.8b // encoding: [0x20,0x1c,0x62,0x0e]
+// CHECK: bic v0.16b, v1.16b, v2.16b // encoding: [0x20,0x1c,0x62,0x4e]
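+
+// Encoding note: all eight bitwise ops above share opcode byte 0x1c and are
+// distinguished only by the size field and the U bit: and/bic/orr/orn take
+// 0x22/0x62/0xa2/0xe2 with U=0 (top byte 0x0e), and eor/bsl/bit/bif take
+// the same four values with U=1 (top byte 0x2e).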
+
diff --git a/test/MC/AArch64/neon-compare-instructions.s b/test/MC/AArch64/neon-compare-instructions.s
new file mode 100644
index 0000000000..e4bc202583
--- /dev/null
+++ b/test/MC/AArch64/neon-compare-instructions.s
@@ -0,0 +1,405 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Integer)
+//----------------------------------------------------------------------
+
+ cmeq v0.8b, v15.8b, v17.8b
+ cmeq v1.16b, v31.16b, v8.16b
+ cmeq v15.4h, v16.4h, v17.4h
+ cmeq v5.8h, v6.8h, v7.8h
+ cmeq v29.2s, v27.2s, v28.2s
+ cmeq v9.4s, v7.4s, v8.4s
+ cmeq v3.2d, v31.2d, v21.2d
+
+// CHECK: cmeq v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x8d,0x31,0x2e]
+// CHECK: cmeq v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x8f,0x28,0x6e]
+// CHECK: cmeq v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x8e,0x71,0x2e]
+// CHECK: cmeq v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x8c,0x67,0x6e]
+// CHECK: cmeq v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x8f,0xbc,0x2e]
+// CHECK: cmeq v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x8c,0xa8,0x6e]
+// CHECK: cmeq v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x8f,0xf5,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is an alias for CMHS with operands reversed.
+//----------------------------------------------------------------------
+
+ cmhs v0.8b, v15.8b, v17.8b
+ cmhs v1.16b, v31.16b, v8.16b
+ cmhs v15.4h, v16.4h, v17.4h
+ cmhs v5.8h, v6.8h, v7.8h
+ cmhs v29.2s, v27.2s, v28.2s
+ cmhs v9.4s, v7.4s, v8.4s
+ cmhs v3.2d, v31.2d, v21.2d
+
+ cmls v0.8b, v17.8b, v15.8b
+ cmls v1.16b, v8.16b, v31.16b
+ cmls v15.4h, v17.4h, v16.4h
+ cmls v5.8h, v7.8h, v6.8h
+ cmls v29.2s, v28.2s, v27.2s
+ cmls v9.4s, v8.4s, v7.4s
+ cmls v3.2d, v21.2d, v31.2d
+
+// CHECK: cmhs v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x2e]
+// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e]
+// CHECK: cmhs v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x2e]
+// CHECK: cmhs v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x6e]
+// CHECK: cmhs v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x2e]
+// CHECK: cmhs v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x6e]
+// CHECK: cmhs v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x6e]
+// CHECK: cmhs v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x2e]
+// CHECK: cmhs v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x6e]
+// CHECK: cmhs v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x2e]
+// CHECK: cmhs v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x6e]
+// CHECK: cmhs v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x2e]
+// CHECK: cmhs v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x6e]
+// CHECK: cmhs v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x6e]
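+
+// Note that the aliased forms are canonicalised at assembly time: the cmls
+// inputs above print back as cmhs with the operands swapped, so both sets
+// of CHECK encodings are identical. The same applies to the cmle, cmlo,
+// cmlt, fcmle and fcmlt tests below.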
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal (Integer)
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is an alias for CMGE with operands reversed.
+//----------------------------------------------------------------------
+
+ cmge v0.8b, v15.8b, v17.8b
+ cmge v1.16b, v31.16b, v8.16b
+ cmge v15.4h, v16.4h, v17.4h
+ cmge v5.8h, v6.8h, v7.8h
+ cmge v29.2s, v27.2s, v28.2s
+ cmge v9.4s, v7.4s, v8.4s
+ cmge v3.2d, v31.2d, v21.2d
+
+ cmle v0.8b, v17.8b, v15.8b
+ cmle v1.16b, v8.16b, v31.16b
+ cmle v15.4h, v17.4h, v16.4h
+ cmle v5.8h, v7.8h, v6.8h
+ cmle v29.2s, v28.2s, v27.2s
+ cmle v9.4s, v8.4s, v7.4s
+ cmle v3.2d, v21.2d, v31.2d
+
+// CHECK: cmge v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x0e]
+// CHECK: cmge v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x4e]
+// CHECK: cmge v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x0e]
+// CHECK: cmge v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x4e]
+// CHECK: cmge v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x0e]
+// CHECK: cmge v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x4e]
+// CHECK: cmge v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x4e]
+// CHECK: cmge v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x3d,0x31,0x0e]
+// CHECK: cmge v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x3f,0x28,0x4e]
+// CHECK: cmge v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x3e,0x71,0x0e]
+// CHECK: cmge v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x3c,0x67,0x4e]
+// CHECK: cmge v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x3f,0xbc,0x0e]
+// CHECK: cmge v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x3c,0xa8,0x4e]
+// CHECK: cmge v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x3f,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher (Unsigned Integer)
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is an alias for CMHI with operands reversed.
+//----------------------------------------------------------------------
+
+ cmhi v0.8b, v15.8b, v17.8b
+ cmhi v1.16b, v31.16b, v8.16b
+ cmhi v15.4h, v16.4h, v17.4h
+ cmhi v5.8h, v6.8h, v7.8h
+ cmhi v29.2s, v27.2s, v28.2s
+ cmhi v9.4s, v7.4s, v8.4s
+ cmhi v3.2d, v31.2d, v21.2d
+
+ cmlo v0.8b, v17.8b, v15.8b
+ cmlo v1.16b, v8.16b, v31.16b
+ cmlo v15.4h, v17.4h, v16.4h
+ cmlo v5.8h, v7.8h, v6.8h
+ cmlo v29.2s, v28.2s, v27.2s
+ cmlo v9.4s, v8.4s, v7.4s
+ cmlo v3.2d, v21.2d, v31.2d
+
+// CHECK: cmhi v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x2e]
+// CHECK: cmhi v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x6e]
+// CHECK: cmhi v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x2e]
+// CHECK: cmhi v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x6e]
+// CHECK: cmhi v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x2e]
+// CHECK: cmhi v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x6e]
+// CHECK: cmhi v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x6e]
+// CHECK: cmhi v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x2e]
+// CHECK: cmhi v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x6e]
+// CHECK: cmhi v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x2e]
+// CHECK: cmhi v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x6e]
+// CHECK: cmhi v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x2e]
+// CHECK: cmhi v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x6e]
+// CHECK: cmhi v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Integer)
+// Vector Compare Mask Less Than (Integer)
+// CMLT is an alias for CMGT with operands reversed.
+//----------------------------------------------------------------------
+
+ cmgt v0.8b, v15.8b, v17.8b
+ cmgt v1.16b, v31.16b, v8.16b
+ cmgt v15.4h, v16.4h, v17.4h
+ cmgt v5.8h, v6.8h, v7.8h
+ cmgt v29.2s, v27.2s, v28.2s
+ cmgt v9.4s, v7.4s, v8.4s
+ cmgt v3.2d, v31.2d, v21.2d
+
+ cmlt v0.8b, v17.8b, v15.8b
+ cmlt v1.16b, v8.16b, v31.16b
+ cmlt v15.4h, v17.4h, v16.4h
+ cmlt v5.8h, v7.8h, v6.8h
+ cmlt v29.2s, v28.2s, v27.2s
+ cmlt v9.4s, v8.4s, v7.4s
+ cmlt v3.2d, v21.2d, v31.2d
+
+// CHECK: cmgt v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x4e]
+// CHECK: cmgt v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x35,0x31,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x37,0x28,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x36,0x71,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x34,0x67,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x37,0xbc,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x34,0xa8,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x37,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Bitwise Test (Integer)
+//----------------------------------------------------------------------
+
+ cmtst v0.8b, v15.8b, v17.8b
+ cmtst v1.16b, v31.16b, v8.16b
+ cmtst v15.4h, v16.4h, v17.4h
+ cmtst v5.8h, v6.8h, v7.8h
+ cmtst v29.2s, v27.2s, v28.2s
+ cmtst v9.4s, v7.4s, v8.4s
+ cmtst v3.2d, v31.2d, v21.2d
+
+// CHECK: cmtst v0.8b, v15.8b, v17.8b // encoding: [0xe0,0x8d,0x31,0x0e]
+// CHECK: cmtst v1.16b, v31.16b, v8.16b // encoding: [0xe1,0x8f,0x28,0x4e]
+// CHECK: cmtst v15.4h, v16.4h, v17.4h // encoding: [0x0f,0x8e,0x71,0x0e]
+// CHECK: cmtst v5.8h, v6.8h, v7.8h // encoding: [0xc5,0x8c,0x67,0x4e]
+// CHECK: cmtst v29.2s, v27.2s, v28.2s // encoding: [0x7d,0x8f,0xbc,0x0e]
+// CHECK: cmtst v9.4s, v7.4s, v8.4s // encoding: [0xe9,0x8c,0xa8,0x4e]
+// CHECK: cmtst v3.2d, v31.2d, v21.2d // encoding: [0xe3,0x8f,0xf5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Floating Point)
+//----------------------------------------------------------------------
+
+ fcmeq v0.2s, v31.2s, v16.2s
+ fcmeq v4.4s, v7.4s, v15.4s
+ fcmeq v29.2d, v2.2d, v5.2d
+
+// CHECK: fcmeq v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0x30,0x0e]
+// CHECK: fcmeq v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0x2f,0x4e]
+// CHECK: fcmeq v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0x65,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is an alias for FCMGE with operands reversed.
+//----------------------------------------------------------------------
+
+ fcmge v31.4s, v29.4s, v28.4s
+ fcmge v3.2s, v8.2s, v12.2s
+ fcmge v17.2d, v15.2d, v13.2d
+ fcmle v31.4s, v28.4s, v29.4s
+ fcmle v3.2s, v12.2s, v8.2s
+ fcmle v17.2d, v13.2d, v15.2d
+
+// CHECK: fcmge v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xe7,0x3c,0x6e]
+// CHECK: fcmge v3.2s, v8.2s, v12.2s // encoding: [0x03,0xe5,0x2c,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xe5,0x6d,0x6e]
+// CHECK: fcmge v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xe7,0x3c,0x6e]
+// CHECK: fcmge v3.2s, v8.2s, v12.2s // encoding: [0x03,0xe5,0x2c,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xe5,0x6d,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Floating Point)
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is an alias for FCMGT with operands reversed.
+//----------------------------------------------------------------------
+
+ fcmgt v0.2s, v31.2s, v16.2s
+ fcmgt v4.4s, v7.4s, v15.4s
+ fcmgt v29.2d, v2.2d, v5.2d
+ fcmlt v0.2s, v16.2s, v31.2s
+ fcmlt v4.4s, v15.4s, v7.4s
+ fcmlt v29.2d, v5.2d, v2.2d
+
+// CHECK: fcmgt v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0xb0,0x2e]
+// CHECK: fcmgt v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0xaf,0x6e]
+// CHECK: fcmgt v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0xe5,0x6e]
+// CHECK: fcmgt v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xe7,0xb0,0x2e]
+// CHECK: fcmgt v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xe4,0xaf,0x6e]
+// CHECK: fcmgt v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xe4,0xe5,0x6e]
+
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Integer)
+//----------------------------------------------------------------------
+
+ cmeq v0.8b, v15.8b, #0
+ cmeq v1.16b, v31.16b, #0
+ cmeq v15.4h, v16.4h, #0
+ cmeq v5.8h, v6.8h, #0
+ cmeq v29.2s, v27.2s, #0
+ cmeq v9.4s, v7.4s, #0
+ cmeq v3.2d, v31.2d, #0
+
+// CHECK: cmeq v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x99,0x20,0x0e]
+// CHECK: cmeq v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x9b,0x20,0x4e]
+// CHECK: cmeq v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x9a,0x60,0x0e]
+// CHECK: cmeq v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x98,0x60,0x4e]
+// CHECK: cmeq v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x9b,0xa0,0x0e]
+// CHECK: cmeq v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x98,0xa0,0x4e]
+// CHECK: cmeq v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x9b,0xe0,0x4e]
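+
+// Encoding note: the compare-against-zero forms use the two-register
+// miscellaneous encoding; #0 is the only immediate this form can express,
+// so it is implicit in the opcode and prints back as #0x0.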
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+//----------------------------------------------------------------------
+ cmge v0.8b, v15.8b, #0
+ cmge v1.16b, v31.16b, #0
+ cmge v15.4h, v16.4h, #0
+ cmge v5.8h, v6.8h, #0
+ cmge v29.2s, v27.2s, #0
+ cmge v17.4s, v20.4s, #0
+ cmge v3.2d, v31.2d, #0
+
+// CHECK: cmge v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x89,0x20,0x2e]
+// CHECK: cmge v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x8b,0x20,0x6e]
+// CHECK: cmge v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x8a,0x60,0x2e]
+// CHECK: cmge v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x88,0x60,0x6e]
+// CHECK: cmge v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x8b,0xa0,0x2e]
+// CHECK: cmge v17.4s, v20.4s, #0x0 // encoding: [0x91,0x8a,0xa0,0x6e]
+// CHECK: cmge v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x8b,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+
+ cmgt v0.8b, v15.8b, #0
+ cmgt v1.16b, v31.16b, #0
+ cmgt v15.4h, v16.4h, #0
+ cmgt v5.8h, v6.8h, #0
+ cmgt v29.2s, v27.2s, #0
+ cmgt v9.4s, v7.4s, #0
+ cmgt v3.2d, v31.2d, #0
+
+// CHECK: cmgt v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x89,0x20,0x0e]
+// CHECK: cmgt v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x8b,0x20,0x4e]
+// CHECK: cmgt v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x8a,0x60,0x0e]
+// CHECK: cmgt v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x88,0x60,0x4e]
+// CHECK: cmgt v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x8b,0xa0,0x0e]
+// CHECK: cmgt v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x88,0xa0,0x4e]
+// CHECK: cmgt v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x8b,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+//----------------------------------------------------------------------
+ cmle v0.8b, v15.8b, #0
+ cmle v1.16b, v31.16b, #0
+ cmle v15.4h, v16.4h, #0
+ cmle v5.8h, v6.8h, #0
+ cmle v29.2s, v27.2s, #0
+ cmle v9.4s, v7.4s, #0
+ cmle v3.2d, v31.2d, #0
+
+// CHECK: cmle v0.8b, v15.8b, #0x0 // encoding: [0xe0,0x99,0x20,0x2e]
+// CHECK: cmle v1.16b, v31.16b, #0x0 // encoding: [0xe1,0x9b,0x20,0x6e]
+// CHECK: cmle v15.4h, v16.4h, #0x0 // encoding: [0x0f,0x9a,0x60,0x2e]
+// CHECK: cmle v5.8h, v6.8h, #0x0 // encoding: [0xc5,0x98,0x60,0x6e]
+// CHECK: cmle v29.2s, v27.2s, #0x0 // encoding: [0x7d,0x9b,0xa0,0x2e]
+// CHECK: cmle v9.4s, v7.4s, #0x0 // encoding: [0xe9,0x98,0xa0,0x6e]
+// CHECK: cmle v3.2d, v31.2d, #0x0 // encoding: [0xe3,0x9b,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+ cmlt v0.8b, v15.8b, #0
+ cmlt v1.16b, v31.16b, #0
+ cmlt v15.4h, v16.4h, #0
+ cmlt v5.8h, v6.8h, #0
+ cmlt v29.2s, v27.2s, #0
+ cmlt v9.4s, v7.4s, #0
+ cmlt v3.2d, v31.2d, #0
+
+// CHECK: cmlt v0.8b, v15.8b, #0x0 // encoding: [0xe0,0xa9,0x20,0x0e]
+// CHECK: cmlt v1.16b, v31.16b, #0x0 // encoding: [0xe1,0xab,0x20,0x4e]
+// CHECK: cmlt v15.4h, v16.4h, #0x0 // encoding: [0x0f,0xaa,0x60,0x0e]
+// CHECK: cmlt v5.8h, v6.8h, #0x0 // encoding: [0xc5,0xa8,0x60,0x4e]
+// CHECK: cmlt v29.2s, v27.2s, #0x0 // encoding: [0x7d,0xab,0xa0,0x0e]
+// CHECK: cmlt v9.4s, v7.4s, #0x0 // encoding: [0xe9,0xa8,0xa0,0x4e]
+// CHECK: cmlt v3.2d, v31.2d, #0x0 // encoding: [0xe3,0xab,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmeq v0.2s, v31.2s, #0.0
+ fcmeq v4.4s, v7.4s, #0.0
+ fcmeq v29.2d, v2.2d, #0.0
+
+// CHECK: fcmeq v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xdb,0xa0,0x0e]
+// CHECK: fcmeq v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xd8,0xa0,0x4e]
+// CHECK: fcmeq v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xd8,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmge v31.4s, v29.4s, #0.0
+ fcmge v3.2s, v8.2s, #0.0
+ fcmge v17.2d, v15.2d, #0.0
+
+// CHECK: fcmge v31.4s, v29.4s, #0.0 // encoding: [0xbf,0xcb,0xa0,0x6e]
+// CHECK: fcmge v3.2s, v8.2s, #0.0 // encoding: [0x03,0xc9,0xa0,0x2e]
+// CHECK: fcmge v17.2d, v15.2d, #0.0 // encoding: [0xf1,0xc9,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmgt v0.2s, v31.2s, #0.0
+ fcmgt v4.4s, v7.4s, #0.0
+ fcmgt v29.2d, v2.2d, #0.0
+
+// CHECK: fcmgt v0.2s, v31.2s, #0.0 // encoding: [0xe0,0xcb,0xa0,0x0e]
+// CHECK: fcmgt v4.4s, v7.4s, #0.0 // encoding: [0xe4,0xc8,0xa0,0x4e]
+// CHECK: fcmgt v29.2d, v2.2d, #0.0 // encoding: [0x5d,0xc8,0xe0,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmle v1.4s, v8.4s, #0.0
+ fcmle v3.2s, v20.2s, #0.0
+ fcmle v7.2d, v13.2d, #0.0
+
+// CHECK: fcmle v1.4s, v8.4s, #0.0 // encoding: [0x01,0xd9,0xa0,0x6e]
+// CHECK: fcmle v3.2s, v20.2s, #0.0 // encoding: [0x83,0xda,0xa0,0x2e]
+// CHECK: fcmle v7.2d, v13.2d, #0.0 // encoding: [0xa7,0xd9,0xe0,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Floating Point)
+//----------------------------------------------------------------------
+ fcmlt v16.2s, v2.2s, #0.0
+ fcmlt v15.4s, v4.4s, #0.0
+ fcmlt v5.2d, v29.2d, #0.0
+
+// CHECK: fcmlt v16.2s, v2.2s, #0.0 // encoding: [0x50,0xe8,0xa0,0x0e]
+// CHECK: fcmlt v15.4s, v4.4s, #0.0 // encoding: [0x8f,0xe8,0xa0,0x4e]
+// CHECK: fcmlt v5.2d, v29.2d, #0.0 // encoding: [0xa5,0xeb,0xe0,0x4e]
+
+
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
new file mode 100644
index 0000000000..5373889222
--- /dev/null
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -0,0 +1,1207 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+//------------------------------------------------------------------------------
+// Vector Integer Add/Sub
+//------------------------------------------------------------------------------
+
+ // Mismatched vector types
+ add v0.16b, v1.8b, v2.8b
+ sub v0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: add v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sub v0.2d, v1.2d, v2.2s
+// CHECK-ERROR: ^
+
+//------------------------------------------------------------------------------
+// Vector Floating-Point Add/Sub
+//------------------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fadd v0.2d, v1.2s, v2.2s
+ fsub v0.4s, v1.2s, v2.4s
+ fsub v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fadd v0.2d, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fsub v0.4s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fsub v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Mul
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ mul v0.16b, v1.8b, v2.8b
+ mul v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mul v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Mul/Div
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ fmul v0.16b, v1.8b, v2.8b
+ fdiv v0.2s, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmul v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fdiv v0.2s, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector And, Orr, Eor, Bsl, Bit, Bif, Orn, Bic
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ and v0.8b, v1.16b, v2.8b
+ orr v0.4h, v1.4h, v2.4h
+ eor v0.2s, v1.2s, v2.2s
+ bsl v0.8b, v1.16b, v2.8b
+ bsl v0.2s, v1.2s, v2.2s
+ bit v0.2d, v1.2d, v2.2d
+ bif v0.4h, v1.4h, v2.4h
+ orn v0.8b, v1.16b, v2.16b
+ bic v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: and v0.8b, v1.16b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orr v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: eor v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bsl v0.8b, v1.16b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bsl v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bit v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bif v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orn v0.8b, v1.16b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bic v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ mla v0.16b, v1.8b, v2.8b
+ mls v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mla v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mls v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-accumulate and Multiply-subtract
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ fmla v0.2s, v1.2d, v2.2d
+ fmls v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmla v0.2s, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmls v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Shifted
+// Vector Move Inverted Immediate Shifted
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.2s, #-1
+ mvni v1.4s, #256
+ // out of range shift (0, 8, 16, 24 and 0, 8)
+ bic v15.4h, #1, lsl #7
+ orr v31.2s, #1, lsl #25
+ movi v5.4h, #10, lsl #16
+ // invalid vector type (2s, 4s, 4h, 8h)
+ movi v5.8b, #1, lsl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2s, #-1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v1.4s, #256
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: bic v15.4h, #1, lsl #7
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: orr v31.2s, #1, lsl #25
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.4h, #10, lsl #16
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.8b, #1, lsl #8
+// CHECK-ERROR: ^
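+
+// For reference, a well-formed shifted immediate here looks like
+// movi v0.2s, #255, lsl #24: the immediate must fit in 0-0xff, the shift
+// must be lsl #0/8/16/24 for .2s/.4s or lsl #0/8 for .4h/.8h, and the
+// byte forms accept no non-zero shift.
+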
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.2s, #-1, msl #8
+ mvni v7.4s, #256, msl #16
+ // out of range shift (8, 16)
+ movi v3.2s, #1, msl #0
+ mvni v17.4s, #255, msl #32
+ // invalid vector type (2s, 4s)
+ movi v5.4h, #31, msl #8
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2s, #-1, msl #8
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v7.4s, #256, msl #16
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v3.2s, #1, msl #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mvni v17.4s, #255, msl #32
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v5.4h, #31, msl #8
+// CHECK-ERROR: ^
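+
+// For reference, the msl (shift-ones) variant is only defined for .2s/.4s
+// with msl #8 or msl #16, so e.g. movi v3.2s, #1, msl #8 would be accepted.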
+
+//----------------------------------------------------------------------
+// Vector Immediate - per byte
+//----------------------------------------------------------------------
+ // out of range immediate (0 to 0xff)
+ movi v0.8b, #-1
+ movi v1.16b, #256
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.8b, #-1
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v1.16b, #256
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, per doubleword
+//---------------------------------------------------------------------
+ // invalid bytemask (0x00 or 0xff)
+ movi v0.2d, #0x10ff00ff00ff00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2d, #0x10ff00ff00ff00ff
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, one doubleword
+//----------------------------------------------------------------------
+ // invalid bytemask (0x00 or 0xff)
+ movi v0.2d, #0xffff00ff001f00ff
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: movi v0.2d, #0xffff00ff001f00ff
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Floating Point Move Immediate
+//----------------------------------------------------------------------
+ // invalid vector type (2s, 4s, 2d)
+ fmov v0.4h, #1.0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmov v0.4h, #1.0
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Move - register
+//----------------------------------------------------------------------
+ // invalid vector type (8b, 16b)
+ mov v0.2s, v31.8b
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: mov v0.2s, v31.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ saba v0.16b, v1.8b, v2.8b
+ uaba v0.2d, v1.2d, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: saba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uaba v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference and Accumulate (Signed, Unsigned)
+// Vector Absolute Difference (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ uaba v0.16b, v1.8b, v2.8b
+ saba v0.2d, v1.2d, v2.2d
+ uabd v0.4s, v1.2s, v2.2s
+ sabd v0.4h, v1.8h, v8.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uaba v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: saba v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uabd v0.4s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sabd v0.4h, v1.8h, v8.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Absolute Difference (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fabd v0.2s, v1.4s, v2.2d
+ fabd v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fabd v0.2s, v1.4s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fabd v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ pmul v0.8b, v1.8b, v2.16b
+ pmul v0.2s, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: pmul v0.8b, v1.8b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: pmul v0.2s, v1.2s, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Add and Sub
+//----------------------------------------------------------------------
+
+ // Mismatched registers
+ add d0, s1, d2
+ sub s1, d1, d2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: add d0, s1, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sub s1, d1, d2
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Step (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ frecps v0.4s, v1.2d, v2.4s
+ frecps v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frecps v0.4s, v1.2d, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frecps v0.8h, v1.8h, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Square Root Step (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ frsqrts v0.2d, v1.2d, v2.2s
+ frsqrts v0.4h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frsqrts v0.2d, v1.2d, v2.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: frsqrts v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ facge v0.2d, v1.2s, v2.2d
+ facge v0.4h, v1.4h, v2.4h
+ facle v0.8h, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: facge v0.2d, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: facge v0.4h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: facle v0.8h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ facgt v0.2d, v1.2d, v2.4s
+ facgt v0.8h, v1.8h, v2.8h
+ faclt v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: facgt v0.2d, v1.2d, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: facgt v0.8h, v1.8h, v2.8h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: faclt v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Integer)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmeq c0.2d, v1.2d, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmeq c0.2d, v1.2d, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher or Same (Unsigned Integer)
+// Vector Compare Mask Less or Same (Unsigned Integer)
+// CMLS is an alias for CMHS with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmhs c0.4h, v1.8b, v2.8b
+ cmls c0.16b, v1.16b, v2.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmhs c0.4h, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmls c0.16b, v1.16b, v2.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal (Integer)
+// Vector Compare Mask Less Than or Equal (Integer)
+// CMLE is an alias for CMGE with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmge c0.8h, v1.8b, v2.8b
+ cmle c0.4h, v1.2s, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmge c0.8h, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmle c0.4h, v1.2s, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Higher (Unsigned Integer)
+// Vector Compare Mask Lower (Unsigned Integer)
+// CMLO is an alias for CMHI with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmhi c0.4s, v1.4s, v2.16b
+ cmlo c0.8b, v1.8b, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmhi c0.4s, v1.4s, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlo c0.8b, v1.8b, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Integer)
+// Vector Compare Mask Less Than (Integer)
+// CMLT is an alias for CMGT with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmgt c0.8b, v1.4s, v2.16b
+ cmlt c0.8h, v1.16b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmgt c0.8b, v1.4s, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlt c0.8h, v1.16b, v2.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Bitwise Test (Integer)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ cmtst c0.16b, v1.16b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmtst c0.16b, v1.16b, v2.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fcmeq v0.2d, v1.2s, v2.2d
+ fcmeq v0.16b, v1.16b, v2.16b
+ fcmeq v0.8b, v1.4h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.2d, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.16b, v1.16b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.8b, v1.4h, v2.4h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Or Equal (Floating Point)
+// Vector Compare Mask Less Than Or Equal (Floating Point)
+// FCMLE is an alias for FCMGE with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fcmge v31.4s, v29.2s, v28.4s
+ fcmge v3.8b, v8.2s, v12.2s
+ fcmle v17.8h, v15.2d, v13.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v31.4s, v29.2s, v28.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v3.8b, v8.2s, v12.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmle v17.8h, v15.2d, v13.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than (Floating Point)
+// Vector Compare Mask Less Than (Floating Point)
+// FCMLT is an alias for FCMGT with operands reversed.
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types
+ fcmgt v0.2d, v31.2s, v16.2s
+ fcmgt v4.4s, v7.4s, v15.4h
+ fcmlt v29.2d, v5.2d, v2.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v0.2d, v31.2s, v16.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmgt v4.4s, v7.4s, v15.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, v2.16b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmeq c0.2d, v1.2s, #0
+ cmeq c0.2d, v1.2d, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmeq c0.2d, v1.2s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmeq c0.2d, v1.2d, #1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmge c0.8h, v1.8b, #0
+ cmge c0.4s, v1.4s, #-1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmge c0.8h, v1.8b, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmge c0.4s, v1.4s, #-1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmgt c0.8b, v1.4s, #0
+ cmgt c0.8b, v1.8b, #-255
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmgt c0.8b, v1.4s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmgt c0.8b, v1.8b, #-255
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmle c0.4h, v1.2s, #0
+ cmle c0.16b, v1.16b, #16
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmle c0.4h, v1.2s, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmle c0.16b, v1.16b, #16
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Signed Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types and invalid imm
+ cmlt c0.8h, v1.16b, #0
+ cmlt c0.8h, v1.8h, #-15
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlt c0.8h, v1.16b, #0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: cmlt c0.8h, v1.8h, #-15
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types, invalid imm
+ fcmeq v0.2d, v1.2s, #0.0
+ fcmeq v0.16b, v1.16b, #0.0
+ fcmeq v0.8b, v1.4h, #1.0
+ fcmeq v0.8b, v1.4h, #1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.2d, v1.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.16b, v1.16b, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmeq v0.8b, v1.4h, #1.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmeq v0.8b, v1.4h, #1
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types, invalid imm
+ fcmge v31.4s, v29.2s, #0.0
+ fcmge v3.8b, v8.2s, #0.0
+ fcmle v17.8h, v15.2d, #-1.0
+ fcmle v17.8h, v15.2d, #0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v31.4s, v29.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v3.8b, v8.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmle v17.8h, v15.2d, #-1.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmle v17.8h, v15.2d, #0
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Compare Mask Greater Than Zero (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types, invalid imm
+ fcmgt v0.2d, v31.2s, #0.0
+ fcmgt v4.4s, v7.4h, #0.0
+ fcmlt v29.2d, v5.2d, #255.0
+ fcmlt v29.2d, v5.2d, #255
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v0.2d, v31.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v4.4s, v7.4h, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #255.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #255
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types, invalid imm
+ fcmge v31.4s, v29.2s, #0.0
+ fcmge v3.8b, v8.2s, #0.0
+ fcmle v17.2d, v15.2d, #15.0
+ fcmle v17.2d, v15.2d, #15
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v31.4s, v29.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmge v3.8b, v8.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmle v17.2d, v15.2d, #15.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmle v17.2d, v15.2d, #15
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Compare Mask Less Than Zero (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types, invalid imm
+ fcmgt v0.2d, v31.2s, #0.0
+ fcmgt v4.4s, v7.4h, #0.0
+ fcmlt v29.2d, v5.2d, #16.0
+ fcmlt v29.2d, v5.2d, #2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v0.2d, v31.2s, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fcmgt v4.4s, v7.4h, #0.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: expected floating-point constant #0.0 or invalid register type
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #16.0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: Expected floating-point immediate
+// CHECK-ERROR: fcmlt v29.2d, v5.2d, #2
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Halving Add (Signed)
+// Vector Integer Halving Add (Unsigned)
+// Vector Integer Halving Sub (Signed)
+// Vector Integer Halving Sub (Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types (2d)
+ shadd v0.2d, v1.2d, v2.2d
+ uhadd v4.2s, v5.2s, v5.4h
+ shsub v11.4h, v12.8h, v13.4h
+ uhsub v31.16b, v29.8b, v28.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: shadd v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uhadd v4.2s, v5.2s, v5.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: shsub v11.4h, v12.8h, v13.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uhsub v31.16b, v29.8b, v28.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+// Vector Integer Rounding Halving Add (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched and invalid vector types (2d)
+ srhadd v0.2s, v1.2s, v2.2d
+ urhadd v0.16b, v1.16b, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srhadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urhadd v0.16b, v1.16b, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Integer Saturating Add (Signed)
+// Vector Integer Saturating Add (Unsigned)
+// Vector Integer Saturating Sub (Signed)
+// Vector Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqadd v0.2s, v1.2s, v2.2d
+ uqadd v31.8h, v1.4h, v2.4h
+ sqsub v10.8h, v1.16b, v2.16b
+ uqsub v31.8b, v1.8b, v2.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqadd v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqadd v31.8h, v1.4h, v2.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqsub v10.8h, v1.16b, v2.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqsub v31.8b, v1.8b, v2.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+// Scalar Integer Saturating Add (Unsigned)
+// Scalar Integer Saturating Sub (Signed)
+// Scalar Integer Saturating Sub (Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched registers
+ sqadd d0, s31, d2
+ uqadd s0, s1, d2
+ sqsub b0, b2, s18
+ uqsub h1, h2, d2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqadd d0, s31, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqadd s0, s1, d2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqsub b0, b2, s18
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqsub h1, h2, d2
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sshl v0.4s, v15.2s, v16.2s
+ ushl v1.16b, v25.16b, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sshl v0.4s, v15.2s, v16.2s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ushl v1.16b, v25.16b, v6.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sqshl v0.2s, v15.2s, v16.2d
+ uqshl v1.8b, v25.4h, v6.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl v0.2s, v15.2s, v16.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqshl v1.8b, v25.4h, v6.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ srshl v0.8h, v15.8h, v16.16b
+ urshl v1.2d, v25.2d, v6.4s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srshl v0.8h, v15.8h, v16.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urshl v1.2d, v25.2d, v6.4s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ // Mismatched vector types
+ sqrshl v0.2s, v15.8h, v16.16b
+ uqrshl v1.4h, v25.4h, v6.2d
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl v0.2s, v15.8h, v16.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl v1.4h, v25.4h, v6.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sshl d0, d1, s2
+ ushl b2, b0, b1
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sshl d0, d1, s2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: ushl b2, b0, b1
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqshl b0, b1, s0
+ uqshl h0, h1, b0
+ sqshl s0, s1, h0
+ uqshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl b0, b1, s0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqshl h0, h1, b0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqshl s0, s1, h0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqshl d0, d1, b0
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ srshl h0, h1, h2
+ urshl s0, s1, s2
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: srshl h0, h1, h2
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: urshl s0, s1, s2
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ sqrshl b0, b1, s0
+ uqrshl h0, h1, b0
+ sqrshl s0, s1, h0
+ uqrshl d0, d1, b0
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl b0, b1, s0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl h0, h1, b0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrshl s0, s1, h0
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uqrshl d0, d1, b0
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ smax v0.2d, v1.2d, v2.2d
+ umax v0.4h, v1.4h, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smax v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umax v0.4h, v1.4h, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Minimum (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ smin v0.2d, v1.2d, v2.2d
+ umin v0.2s, v1.2s, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smin v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umin v0.2s, v1.2s, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmax v0.2s, v1.2s, v2.4s
+ fmax v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmax v0.2s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmax v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Minimum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmin v0.4s, v1.4s, v2.2d
+ fmin v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmin v0.4s, v1.4s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmin v0.8h, v1.8h, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector maxNum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmaxnm v0.2s, v1.2s, v2.2d
+ fmaxnm v0.4h, v1.8h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnm v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnm v0.4h, v1.8h, v2.4h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector minNum (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fminnm v0.4s, v1.2s, v2.4s
+ fminnm v0.16b, v0.16b, v0.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnm v0.4s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnm v0.16b, v0.16b, v0.16b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ smaxp v0.2d, v1.2d, v2.2d
+ umaxp v0.4h, v1.4h, v2.2s
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: smaxp v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: umaxp v0.4h, v1.4h, v2.2s
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Signed, Unsigned)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sminp v0.2d, v1.2d, v2.2d
+ uminp v0.2s, v1.2s, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sminp v0.2d, v1.2d, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: uminp v0.2s, v1.2s, v2.8b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmaxp v0.2s, v1.2s, v2.4s
+ fmaxp v0.8b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxp v0.2s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxp v0.8b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fminp v0.4s, v1.4s, v2.2d
+ fminp v0.8h, v1.8h, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminp v0.4s, v1.4s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminp v0.8h, v1.8h, v2.8h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector maxNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmaxnmp v0.2s, v1.2s, v2.2d
+ fmaxnmp v0.4h, v1.8h, v2.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnmp v0.2s, v1.2s, v2.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmaxnmp v0.4h, v1.8h, v2.4h
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fminnmp v0.4s, v1.2s, v2.4s
+ fminnmp v0.16b, v0.16b, v0.16b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnmp v0.4s, v1.2s, v2.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fminnmp v0.16b, v0.16b, v0.16b
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Integer)
+//----------------------------------------------------------------------
+
+ // Mismatched vector types
+ addp v0.16b, v1.8b, v2.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: addp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Add Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ faddp v0.16b, v1.8b, v2.8b
+ faddp v0.2d, v1.2d, v2.8h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: faddp v0.16b, v1.8b, v2.8b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: faddp v0.2d, v1.2d, v2.8h
+// CHECK-ERROR: ^
+
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sqdmulh v2.4h, v25.8h, v3.4h
+ sqdmulh v12.2d, v5.2d, v13.2d
+ sqdmulh v3.8b, v1.8b, v30.8b
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v2.4h, v25.8h, v3.4h
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v12.2d, v5.2d, v13.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqdmulh v3.8b, v1.8b, v30.8b
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ sqrdmulh v2.2s, v25.4s, v3.4s
+ sqrdmulh v12.16b, v5.16b, v13.16b
+ sqrdmulh v3.4h, v1.4h, v30.2d
+
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v2.2s, v25.4s, v3.4s
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v12.16b, v5.16b, v13.16b
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: sqrdmulh v3.4h, v1.4h, v30.2d
+// CHECK-ERROR: ^
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
+ // Mismatched and invalid vector types
+ fmulx v21.2s, v5.2s, v13.2d
+ fmulx v1.4h, v25.4h, v3.4h
+
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmulx v21.2s, v5.2s, v13.2d
+// CHECK-ERROR: ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: fmulx v1.4h, v25.4h, v3.4h
+// CHECK-ERROR: ^
diff --git a/test/MC/AArch64/neon-facge-facgt.s b/test/MC/AArch64/neon-facge-facgt.s
new file mode 100644
index 0000000000..212eda2f20
--- /dev/null
+++ b/test/MC/AArch64/neon-facge-facgt.s
@@ -0,0 +1,41 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
+// FACLE is an alias for FACGE with the operands reversed
+//----------------------------------------------------------------------
+ facge v0.2s, v31.2s, v16.2s
+ facge v4.4s, v7.4s, v15.4s
+ facge v29.2d, v2.2d, v5.2d
+ facle v0.2s, v16.2s, v31.2s
+ facle v4.4s, v15.4s, v7.4s
+ facle v29.2d, v5.2d, v2.2d
+
+// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e]
+// CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e]
+// CHECK: facge v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xec,0x65,0x6e]
+// CHECK: facge v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xef,0x30,0x2e]
+// CHECK: facge v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xec,0x2f,0x6e]
+// CHECK: facge v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xec,0x65,0x6e]
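+// Note how each facle above prints back as facge with the source operands
+// swapped and an identical encoding, matching the alias rule.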
+
+//----------------------------------------------------------------------
+// Vector Absolute Compare Mask Less Than (Floating Point)
+// FACLT is an alias for FACGT with the operands reversed
+//----------------------------------------------------------------------
+ facgt v31.4s, v29.4s, v28.4s
+ facgt v3.2s, v8.2s, v12.2s
+ facgt v17.2d, v15.2d, v13.2d
+ faclt v31.4s, v28.4s, v29.4s
+ faclt v3.2s, v12.2s, v8.2s
+ faclt v17.2d, v13.2d, v15.2d
+
+// CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e]
+// CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e]
+// CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e]
+// CHECK: facgt v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xef,0xbc,0x6e]
+// CHECK: facgt v3.2s, v8.2s, v12.2s // encoding: [0x03,0xed,0xac,0x2e]
+// CHECK: facgt v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xed,0xed,0x6e]
+
+
diff --git a/test/MC/AArch64/neon-frsqrt-frecp.s b/test/MC/AArch64/neon-frsqrt-frecp.s
new file mode 100644
index 0000000000..79fe5da5e7
--- /dev/null
+++ b/test/MC/AArch64/neon-frsqrt-frecp.s
@@ -0,0 +1,27 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Square Root Step (Floating Point)
+//----------------------------------------------------------------------
+ frsqrts v0.2s, v31.2s, v16.2s
+ frsqrts v4.4s, v7.4s, v15.4s
+ frsqrts v29.2d, v2.2d, v5.2d
+
+// CHECK: frsqrts v0.2s, v31.2s, v16.2s // encoding: [0xe0,0xff,0xb0,0x0e]
+// CHECK: frsqrts v4.4s, v7.4s, v15.4s // encoding: [0xe4,0xfc,0xaf,0x4e]
+// CHECK: frsqrts v29.2d, v2.2d, v5.2d // encoding: [0x5d,0xfc,0xe5,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Reciprocal Step (Floating Point)
+//----------------------------------------------------------------------
+ frecps v31.4s, v29.4s, v28.4s
+ frecps v3.2s, v8.2s, v12.2s
+ frecps v17.2d, v15.2d, v13.2d
+
+// CHECK: frecps v31.4s, v29.4s, v28.4s // encoding: [0xbf,0xff,0x3c,0x4e]
+// CHECK: frecps v3.2s, v8.2s, v12.2s // encoding: [0x03,0xfd,0x2c,0x0e]
+// CHECK: frecps v17.2d, v15.2d, v13.2d // encoding: [0xf1,0xfd,0x6d,0x4e]
+
+
diff --git a/test/MC/AArch64/neon-halving-add-sub.s b/test/MC/AArch64/neon-halving-add-sub.s
new file mode 100644
index 0000000000..555f1b83b4
--- /dev/null
+++ b/test/MC/AArch64/neon-halving-add-sub.s
@@ -0,0 +1,74 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Add (Signed)
+//------------------------------------------------------------------------------
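+// Per lane, the halving adds compute (a + b) >> 1 with a double-width
+// intermediate, so e.g. two int8 lanes of 127 halve to 127 instead of
+// wrapping.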
+ shadd v0.8b, v1.8b, v2.8b
+ shadd v0.16b, v1.16b, v2.16b
+ shadd v0.4h, v1.4h, v2.4h
+ shadd v0.8h, v1.8h, v2.8h
+ shadd v0.2s, v1.2s, v2.2s
+ shadd v0.4s, v1.4s, v2.4s
+
+// CHECK: shadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x04,0x22,0x0e]
+// CHECK: shadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x04,0x22,0x4e]
+// CHECK: shadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x62,0x0e]
+// CHECK: shadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x04,0x62,0x4e]
+// CHECK: shadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x04,0xa2,0x0e]
+// CHECK: shadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x04,0xa2,0x4e]
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Add (Unsigned)
+//------------------------------------------------------------------------------
+ uhadd v0.8b, v1.8b, v2.8b
+ uhadd v0.16b, v1.16b, v2.16b
+ uhadd v0.4h, v1.4h, v2.4h
+ uhadd v0.8h, v1.8h, v2.8h
+ uhadd v0.2s, v1.2s, v2.2s
+ uhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: uhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x04,0x22,0x2e]
+// CHECK: uhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x04,0x22,0x6e]
+// CHECK: uhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x04,0x62,0x2e]
+// CHECK: uhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x04,0x62,0x6e]
+// CHECK: uhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x04,0xa2,0x2e]
+// CHECK: uhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x04,0xa2,0x6e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Sub (Signed)
+//------------------------------------------------------------------------------
+ shsub v0.8b, v1.8b, v2.8b
+ shsub v0.16b, v1.16b, v2.16b
+ shsub v0.4h, v1.4h, v2.4h
+ shsub v0.8h, v1.8h, v2.8h
+ shsub v0.2s, v1.2s, v2.2s
+ shsub v0.4s, v1.4s, v2.4s
+
+// CHECK: shsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x24,0x22,0x0e]
+// CHECK: shsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x24,0x22,0x4e]
+// CHECK: shsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x24,0x62,0x0e]
+// CHECK: shsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x24,0x62,0x4e]
+// CHECK: shsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x24,0xa2,0x0e]
+// CHECK: shsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x24,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Halving Sub (Unsigned)
+//------------------------------------------------------------------------------
+ uhsub v0.8b, v1.8b, v2.8b
+ uhsub v0.16b, v1.16b, v2.16b
+ uhsub v0.4h, v1.4h, v2.4h
+ uhsub v0.8h, v1.8h, v2.8h
+ uhsub v0.2s, v1.2s, v2.2s
+ uhsub v0.4s, v1.4s, v2.4s
+
+// CHECK: uhsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x24,0x22,0x2e]
+// CHECK: uhsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x24,0x22,0x6e]
+// CHECK: uhsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x24,0x62,0x2e]
+// CHECK: uhsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x24,0x62,0x6e]
+// CHECK: uhsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x24,0xa2,0x2e]
+// CHECK: uhsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x24,0xa2,0x6e]
+
diff --git a/test/MC/AArch64/neon-max-min-pairwise.s b/test/MC/AArch64/neon-max-min-pairwise.s
new file mode 100644
index 0000000000..8d2dadb199
--- /dev/null
+++ b/test/MC/AArch64/neon-max-min-pairwise.s
@@ -0,0 +1,110 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
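+// Pairwise ops reduce adjacent element pairs drawn from the concatenation
+// of the two sources: the low result lanes come from pairs of the first
+// source register, the high lanes from pairs of the second.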
+ smaxp v0.8b, v1.8b, v2.8b
+ smaxp v0.16b, v1.16b, v2.16b
+ smaxp v0.4h, v1.4h, v2.4h
+ smaxp v0.8h, v1.8h, v2.8h
+ smaxp v0.2s, v1.2s, v2.2s
+ smaxp v0.4s, v1.4s, v2.4s
+
+// CHECK: smaxp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xa4,0x22,0x0e]
+// CHECK: smaxp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xa4,0x22,0x4e]
+// CHECK: smaxp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xa4,0x62,0x0e]
+// CHECK: smaxp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xa4,0x62,0x4e]
+// CHECK: smaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xa4,0xa2,0x0e]
+// CHECK: smaxp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xa4,0xa2,0x4e]
+
+ umaxp v0.8b, v1.8b, v2.8b
+ umaxp v0.16b, v1.16b, v2.16b
+ umaxp v0.4h, v1.4h, v2.4h
+ umaxp v0.8h, v1.8h, v2.8h
+ umaxp v0.2s, v1.2s, v2.2s
+ umaxp v0.4s, v1.4s, v2.4s
+
+// CHECK: umaxp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xa4,0x22,0x2e]
+// CHECK: umaxp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xa4,0x22,0x6e]
+// CHECK: umaxp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xa4,0x62,0x2e]
+// CHECK: umaxp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xa4,0x62,0x6e]
+// CHECK: umaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xa4,0xa2,0x2e]
+// CHECK: umaxp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xa4,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ sminp v0.8b, v1.8b, v2.8b
+ sminp v0.16b, v1.16b, v2.16b
+ sminp v0.4h, v1.4h, v2.4h
+ sminp v0.8h, v1.8h, v2.8h
+ sminp v0.2s, v1.2s, v2.2s
+ sminp v0.4s, v1.4s, v2.4s
+
+// CHECK: sminp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xac,0x22,0x0e]
+// CHECK: sminp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xac,0x22,0x4e]
+// CHECK: sminp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xac,0x62,0x0e]
+// CHECK: sminp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xac,0x62,0x4e]
+// CHECK: sminp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xac,0xa2,0x0e]
+// CHECK: sminp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xac,0xa2,0x4e]
+
+ uminp v0.8b, v1.8b, v2.8b
+ uminp v0.16b, v1.16b, v2.16b
+ uminp v0.4h, v1.4h, v2.4h
+ uminp v0.8h, v1.8h, v2.8h
+ uminp v0.2s, v1.2s, v2.2s
+ uminp v0.4s, v1.4s, v2.4s
+
+// CHECK: uminp v0.8b, v1.8b, v2.8b // encoding: [0x20,0xac,0x22,0x2e]
+// CHECK: uminp v0.16b, v1.16b, v2.16b // encoding: [0x20,0xac,0x22,0x6e]
+// CHECK: uminp v0.4h, v1.4h, v2.4h // encoding: [0x20,0xac,0x62,0x2e]
+// CHECK: uminp v0.8h, v1.8h, v2.8h // encoding: [0x20,0xac,0x62,0x6e]
+// CHECK: uminp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xac,0xa2,0x2e]
+// CHECK: uminp v0.4s, v1.4s, v2.4s // encoding: [0x20,0xac,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Maximum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ fmaxp v0.2s, v1.2s, v2.2s
+ fmaxp v31.4s, v15.4s, v16.4s
+ fmaxp v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x2e]
+// CHECK: fmaxp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x6e]
+// CHECK: fmaxp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ fminp v10.2s, v15.2s, v22.2s
+ fminp v3.4s, v5.4s, v6.4s
+ fminp v17.2d, v13.2d, v2.2d
+
+// CHECK: fminp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x2e]
+// CHECK: fminp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x6e]
+// CHECK: fminp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector maxNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ fmaxnmp v0.2s, v1.2s, v2.2s
+ fmaxnmp v31.4s, v15.4s, v16.4s
+ fmaxnmp v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxnmp v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x2e]
+// CHECK: fmaxnmp v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x6e]
+// CHECK: fmaxnmp v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x6e]
+
+//----------------------------------------------------------------------
+// Vector minNum Pairwise (Floating Point)
+//----------------------------------------------------------------------
+ fminnmp v10.2s, v15.2s, v22.2s
+ fminnmp v3.4s, v5.4s, v6.4s
+ fminnmp v17.2d, v13.2d, v2.2d
+
+// CHECK: fminnmp v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x2e]
+// CHECK: fminnmp v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xc4,0xa6,0x6e]
+// CHECK: fminnmp v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x6e]
+
diff --git a/test/MC/AArch64/neon-max-min.s b/test/MC/AArch64/neon-max-min.s
new file mode 100644
index 0000000000..6d1efde507
--- /dev/null
+++ b/test/MC/AArch64/neon-max-min.s
@@ -0,0 +1,110 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Maximum (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ smax v0.8b, v1.8b, v2.8b
+ smax v0.16b, v1.16b, v2.16b
+ smax v0.4h, v1.4h, v2.4h
+ smax v0.8h, v1.8h, v2.8h
+ smax v0.2s, v1.2s, v2.2s
+ smax v0.4s, v1.4s, v2.4s
+
+// CHECK: smax v0.8b, v1.8b, v2.8b // encoding: [0x20,0x64,0x22,0x0e]
+// CHECK: smax v0.16b, v1.16b, v2.16b // encoding: [0x20,0x64,0x22,0x4e]
+// CHECK: smax v0.4h, v1.4h, v2.4h // encoding: [0x20,0x64,0x62,0x0e]
+// CHECK: smax v0.8h, v1.8h, v2.8h // encoding: [0x20,0x64,0x62,0x4e]
+// CHECK: smax v0.2s, v1.2s, v2.2s // encoding: [0x20,0x64,0xa2,0x0e]
+// CHECK: smax v0.4s, v1.4s, v2.4s // encoding: [0x20,0x64,0xa2,0x4e]
+
+ umax v0.8b, v1.8b, v2.8b
+ umax v0.16b, v1.16b, v2.16b
+ umax v0.4h, v1.4h, v2.4h
+ umax v0.8h, v1.8h, v2.8h
+ umax v0.2s, v1.2s, v2.2s
+ umax v0.4s, v1.4s, v2.4s
+
+// CHECK: umax v0.8b, v1.8b, v2.8b // encoding: [0x20,0x64,0x22,0x2e]
+// CHECK: umax v0.16b, v1.16b, v2.16b // encoding: [0x20,0x64,0x22,0x6e]
+// CHECK: umax v0.4h, v1.4h, v2.4h // encoding: [0x20,0x64,0x62,0x2e]
+// CHECK: umax v0.8h, v1.8h, v2.8h // encoding: [0x20,0x64,0x62,0x6e]
+// CHECK: umax v0.2s, v1.2s, v2.2s // encoding: [0x20,0x64,0xa2,0x2e]
+// CHECK: umax v0.4s, v1.4s, v2.4s // encoding: [0x20,0x64,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Minimum (Signed and Unsigned Integer)
+//----------------------------------------------------------------------
+ smin v0.8b, v1.8b, v2.8b
+ smin v0.16b, v1.16b, v2.16b
+ smin v0.4h, v1.4h, v2.4h
+ smin v0.8h, v1.8h, v2.8h
+ smin v0.2s, v1.2s, v2.2s
+ smin v0.4s, v1.4s, v2.4s
+
+// CHECK: smin v0.8b, v1.8b, v2.8b // encoding: [0x20,0x6c,0x22,0x0e]
+// CHECK: smin v0.16b, v1.16b, v2.16b // encoding: [0x20,0x6c,0x22,0x4e]
+// CHECK: smin v0.4h, v1.4h, v2.4h // encoding: [0x20,0x6c,0x62,0x0e]
+// CHECK: smin v0.8h, v1.8h, v2.8h // encoding: [0x20,0x6c,0x62,0x4e]
+// CHECK: smin v0.2s, v1.2s, v2.2s // encoding: [0x20,0x6c,0xa2,0x0e]
+// CHECK: smin v0.4s, v1.4s, v2.4s // encoding: [0x20,0x6c,0xa2,0x4e]
+
+ umin v0.8b, v1.8b, v2.8b
+ umin v0.16b, v1.16b, v2.16b
+ umin v0.4h, v1.4h, v2.4h
+ umin v0.8h, v1.8h, v2.8h
+ umin v0.2s, v1.2s, v2.2s
+ umin v0.4s, v1.4s, v2.4s
+
+// CHECK: umin v0.8b, v1.8b, v2.8b // encoding: [0x20,0x6c,0x22,0x2e]
+// CHECK: umin v0.16b, v1.16b, v2.16b // encoding: [0x20,0x6c,0x22,0x6e]
+// CHECK: umin v0.4h, v1.4h, v2.4h // encoding: [0x20,0x6c,0x62,0x2e]
+// CHECK: umin v0.8h, v1.8h, v2.8h // encoding: [0x20,0x6c,0x62,0x6e]
+// CHECK: umin v0.2s, v1.2s, v2.2s // encoding: [0x20,0x6c,0xa2,0x2e]
+// CHECK: umin v0.4s, v1.4s, v2.4s // encoding: [0x20,0x6c,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Maximum (Floating Point)
+//----------------------------------------------------------------------
+ fmax v0.2s, v1.2s, v2.2s
+ fmax v31.4s, v15.4s, v16.4s
+ fmax v7.2d, v8.2d, v25.2d
+
+// CHECK: fmax v0.2s, v1.2s, v2.2s // encoding: [0x20,0xf4,0x22,0x0e]
+// CHECK: fmax v31.4s, v15.4s, v16.4s // encoding: [0xff,0xf5,0x30,0x4e]
+// CHECK: fmax v7.2d, v8.2d, v25.2d // encoding: [0x07,0xf5,0x79,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Minimum (Floating Point)
+//----------------------------------------------------------------------
+ fmin v10.2s, v15.2s, v22.2s
+ fmin v3.4s, v5.4s, v6.4s
+ fmin v17.2d, v13.2d, v2.2d
+
+// CHECK: fmin v10.2s, v15.2s, v22.2s // encoding: [0xea,0xf5,0xb6,0x0e]
+// CHECK: fmin v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xf4,0xa6,0x4e]
+// CHECK: fmin v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xf5,0xe2,0x4e]
+
+//----------------------------------------------------------------------
+// Vector maxNum (Floating Point)
+//----------------------------------------------------------------------
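+// fmaxnm/fminnm follow IEEE 754-2008 maxNum/minNum: if exactly one operand
+// is a quiet NaN, it is treated as missing and the other operand is returned.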
+ fmaxnm v0.2s, v1.2s, v2.2s
+ fmaxnm v31.4s, v15.4s, v16.4s
+ fmaxnm v7.2d, v8.2d, v25.2d
+
+// CHECK: fmaxnm v0.2s, v1.2s, v2.2s // encoding: [0x20,0xc4,0x22,0x0e]
+// CHECK: fmaxnm v31.4s, v15.4s, v16.4s // encoding: [0xff,0xc5,0x30,0x4e]
+// CHECK: fmaxnm v7.2d, v8.2d, v25.2d // encoding: [0x07,0xc5,0x79,0x4e]
+
+//----------------------------------------------------------------------
+// Vector minNum (Floating Point)
+//----------------------------------------------------------------------
+ fminnm v10.2s, v15.2s, v22.2s
+ fminnm v3.4s, v5.4s, v6.4s
+ fminnm v17.2d, v13.2d, v2.2d
+
+// CHECK: fminnm v10.2s, v15.2s, v22.2s // encoding: [0xea,0xc5,0xb6,0x0e]
+// CHECK: fminnm v3.4s, v5.4s, v6.4s // encoding: [0xa3,0xc4,0xa6,0x4e]
+// CHECK: fminnm v17.2d, v13.2d, v2.2d // encoding: [0xb1,0xc5,0xe2,0x4e]
+
diff --git a/test/MC/AArch64/neon-mla-mls-instructions.s b/test/MC/AArch64/neon-mla-mls-instructions.s
new file mode 100644
index 0000000000..3072e6f120
--- /dev/null
+++ b/test/MC/AArch64/neon-mla-mls-instructions.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-accumulate
+//----------------------------------------------------------------------
+ mla v0.8b, v1.8b, v2.8b
+ mla v0.16b, v1.16b, v2.16b
+ mla v0.4h, v1.4h, v2.4h
+ mla v0.8h, v1.8h, v2.8h
+ mla v0.2s, v1.2s, v2.2s
+ mla v0.4s, v1.4s, v2.4s
+
+// CHECK: mla v0.8b, v1.8b, v2.8b // encoding: [0x20,0x94,0x22,0x0e]
+// CHECK: mla v0.16b, v1.16b, v2.16b // encoding: [0x20,0x94,0x22,0x4e]
+// CHECK: mla v0.4h, v1.4h, v2.4h // encoding: [0x20,0x94,0x62,0x0e]
+// CHECK: mla v0.8h, v1.8h, v2.8h // encoding: [0x20,0x94,0x62,0x4e]
+// CHECK: mla v0.2s, v1.2s, v2.2s // encoding: [0x20,0x94,0xa2,0x0e]
+// CHECK: mla v0.4s, v1.4s, v2.4s // encoding: [0x20,0x94,0xa2,0x4e]
+
+
+//----------------------------------------------------------------------
+// Vector Integer Multiply-subtract
+//----------------------------------------------------------------------
+ mls v0.8b, v1.8b, v2.8b
+ mls v0.16b, v1.16b, v2.16b
+ mls v0.4h, v1.4h, v2.4h
+ mls v0.8h, v1.8h, v2.8h
+ mls v0.2s, v1.2s, v2.2s
+ mls v0.4s, v1.4s, v2.4s
+
+// CHECK: mls v0.8b, v1.8b, v2.8b // encoding: [0x20,0x94,0x22,0x2e]
+// CHECK: mls v0.16b, v1.16b, v2.16b // encoding: [0x20,0x94,0x22,0x6e]
+// CHECK: mls v0.4h, v1.4h, v2.4h // encoding: [0x20,0x94,0x62,0x2e]
+// CHECK: mls v0.8h, v1.8h, v2.8h // encoding: [0x20,0x94,0x62,0x6e]
+// CHECK: mls v0.2s, v1.2s, v2.2s // encoding: [0x20,0x94,0xa2,0x2e]
+// CHECK: mls v0.4s, v1.4s, v2.4s // encoding: [0x20,0x94,0xa2,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-accumulate
+//----------------------------------------------------------------------
+ fmla v0.2s, v1.2s, v2.2s
+ fmla v0.4s, v1.4s, v2.4s
+ fmla v0.2d, v1.2d, v2.2d
+
+// CHECK: fmla v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0x22,0x0e]
+// CHECK: fmla v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0x22,0x4e]
+// CHECK: fmla v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0x62,0x4e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Multiply-subtract
+//----------------------------------------------------------------------
+ fmls v0.2s, v1.2s, v2.2s
+ fmls v0.4s, v1.4s, v2.4s
+ fmls v0.2d, v1.2d, v2.2d
+
+// CHECK: fmls v0.2s, v1.2s, v2.2s // encoding: [0x20,0xcc,0xa2,0x0e]
+// CHECK: fmls v0.4s, v1.4s, v2.4s // encoding: [0x20,0xcc,0xa2,0x4e]
+// CHECK: fmls v0.2d, v1.2d, v2.2d // encoding: [0x20,0xcc,0xe2,0x4e]
+
diff --git a/test/MC/AArch64/neon-mov.s b/test/MC/AArch64/neon-mov.s
new file mode 100644
index 0000000000..8331372410
--- /dev/null
+++ b/test/MC/AArch64/neon-mov.s
@@ -0,0 +1,207 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Shifted
+//----------------------------------------------------------------------
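+// The operand is an 8-bit immediate optionally shifted into place: lsl #8,
+// #16 or #24 for 32-bit lanes, and only lsl #8 for 16-bit lanes, as
+// enumerated below.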
+ movi v0.2s, #1
+ movi v1.2s, #0
+ movi v15.2s, #1, lsl #8
+ movi v16.2s, #1, lsl #16
+ movi v31.2s, #1, lsl #24
+ movi v0.4s, #1
+ movi v0.4s, #1, lsl #8
+ movi v0.4s, #1, lsl #16
+ movi v0.4s, #1, lsl #24
+ movi v0.4h, #1
+ movi v0.4h, #1, lsl #8
+ movi v0.8h, #1
+ movi v0.8h, #1, lsl #8
+
+// CHECK: movi v0.2s, #0x1 // encoding: [0x20,0x04,0x00,0x0f]
+// CHECK: movi v1.2s, #0x0 // encoding: [0x01,0x04,0x00,0x0f]
+// CHECK: movi v15.2s, #0x1, lsl #8 // encoding: [0x2f,0x24,0x00,0x0f]
+// CHECK: movi v16.2s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x0f]
+// CHECK: movi v31.2s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x0f]
+// CHECK: movi v0.4s, #0x1 // encoding: [0x20,0x04,0x00,0x4f]
+// CHECK: movi v0.4s, #0x1, lsl #8 // encoding: [0x20,0x24,0x00,0x4f]
+// CHECK: movi v0.4s, #0x1, lsl #16 // encoding: [0x20,0x44,0x00,0x4f]
+// CHECK: movi v0.4s, #0x1, lsl #24 // encoding: [0x20,0x64,0x00,0x4f]
+// CHECK: movi v0.4h, #0x1 // encoding: [0x20,0x84,0x00,0x0f]
+// CHECK: movi v0.4h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x0f]
+// CHECK: movi v0.8h, #0x1 // encoding: [0x20,0x84,0x00,0x4f]
+// CHECK: movi v0.8h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Inverted Immediate Shifted
+//----------------------------------------------------------------------
+ mvni v0.2s, #1
+ mvni v1.2s, #0
+ mvni v0.2s, #1, lsl #8
+ mvni v0.2s, #1, lsl #16
+ mvni v0.2s, #1, lsl #24
+ mvni v0.4s, #1
+ mvni v15.4s, #1, lsl #8
+ mvni v16.4s, #1, lsl #16
+ mvni v31.4s, #1, lsl #24
+ mvni v0.4h, #1
+ mvni v0.4h, #1, lsl #8
+ mvni v0.8h, #1
+ mvni v0.8h, #1, lsl #8
+
+// CHECK: mvni v0.2s, #0x1 // encoding: [0x20,0x04,0x00,0x2f]
+// CHECK: mvni v1.2s, #0x0 // encoding: [0x01,0x04,0x00,0x2f]
+// CHECK: mvni v0.2s, #0x1, lsl #8 // encoding: [0x20,0x24,0x00,0x2f]
+// CHECK: mvni v0.2s, #0x1, lsl #16 // encoding: [0x20,0x44,0x00,0x2f]
+// CHECK: mvni v0.2s, #0x1, lsl #24 // encoding: [0x20,0x64,0x00,0x2f]
+// CHECK: mvni v0.4s, #0x1 // encoding: [0x20,0x04,0x00,0x6f]
+// CHECK: mvni v15.4s, #0x1, lsl #8 // encoding: [0x2f,0x24,0x00,0x6f]
+// CHECK: mvni v16.4s, #0x1, lsl #16 // encoding: [0x30,0x44,0x00,0x6f]
+// CHECK: mvni v31.4s, #0x1, lsl #24 // encoding: [0x3f,0x64,0x00,0x6f]
+// CHECK: mvni v0.4h, #0x1 // encoding: [0x20,0x84,0x00,0x2f]
+// CHECK: mvni v0.4h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x2f]
+// CHECK: mvni v0.8h, #0x1 // encoding: [0x20,0x84,0x00,0x6f]
+// CHECK: mvni v0.8h, #0x1, lsl #8 // encoding: [0x20,0xa4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Bitwise Bit Clear (AND NOT) - immediate
+//----------------------------------------------------------------------
+ bic v0.2s, #1
+ bic v1.2s, #0
+ bic v0.2s, #1, lsl #8
+ bic v0.2s, #1, lsl #16
+ bic v0.2s, #1, lsl #24
+ bic v0.4s, #1
+ bic v0.4s, #1, lsl #8
+ bic v0.4s, #1, lsl #16
+ bic v0.4s, #1, lsl #24
+ bic v15.4h, #1
+ bic v16.4h, #1, lsl #8
+ bic v0.8h, #1
+ bic v31.8h, #1, lsl #8
+
+// CHECK: bic v0.2s, #0x1 // encoding: [0x20,0x14,0x00,0x2f]
+// CHECK: bic v1.2s, #0x0 // encoding: [0x01,0x14,0x00,0x2f]
+// CHECK: bic v0.2s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x2f]
+// CHECK: bic v0.2s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x2f]
+// CHECK: bic v0.2s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x2f]
+// CHECK: bic v0.4s, #0x1 // encoding: [0x20,0x14,0x00,0x6f]
+// CHECK: bic v0.4s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x6f]
+// CHECK: bic v0.4s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x6f]
+// CHECK: bic v0.4s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x6f]
+// CHECK: bic v15.4h, #0x1 // encoding: [0x2f,0x94,0x00,0x2f]
+// CHECK: bic v16.4h, #0x1, lsl #8 // encoding: [0x30,0xb4,0x00,0x2f]
+// CHECK: bic v0.8h, #0x1 // encoding: [0x20,0x94,0x00,0x6f]
+// CHECK: bic v31.8h, #0x1, lsl #8 // encoding: [0x3f,0xb4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Bitwise OR - immediate
+//----------------------------------------------------------------------
+ orr v0.2s, #1
+ orr v1.2s, #0
+ orr v0.2s, #1, lsl #8
+ orr v0.2s, #1, lsl #16
+ orr v0.2s, #1, lsl #24
+ orr v0.4s, #1
+ orr v0.4s, #1, lsl #8
+ orr v0.4s, #1, lsl #16
+ orr v0.4s, #1, lsl #24
+ orr v31.4h, #1
+ orr v15.4h, #1, lsl #8
+ orr v0.8h, #1
+ orr v16.8h, #1, lsl #8
+
+// CHECK: orr v0.2s, #0x1 // encoding: [0x20,0x14,0x00,0x0f]
+// CHECK: orr v1.2s, #0x0 // encoding: [0x01,0x14,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x0f]
+// CHECK: orr v0.2s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x0f]
+// CHECK: orr v0.4s, #0x1 // encoding: [0x20,0x14,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #8 // encoding: [0x20,0x34,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #16 // encoding: [0x20,0x54,0x00,0x4f]
+// CHECK: orr v0.4s, #0x1, lsl #24 // encoding: [0x20,0x74,0x00,0x4f]
+// CHECK: orr v31.4h, #0x1 // encoding: [0x3f,0x94,0x00,0x0f]
+// CHECK: orr v15.4h, #0x1, lsl #8 // encoding: [0x2f,0xb4,0x00,0x0f]
+// CHECK: orr v0.8h, #0x1 // encoding: [0x20,0x94,0x00,0x4f]
+// CHECK: orr v16.8h, #0x1, lsl #8 // encoding: [0x30,0xb4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate Masked
+//----------------------------------------------------------------------
+ movi v0.2s, #1, msl #8
+ movi v1.2s, #1, msl #16
+ movi v0.4s, #1, msl #8
+ movi v31.4s, #1, msl #16
+
+// CHECK: movi v0.2s, #0x1, msl #8 // encoding: [0x20,0xc4,0x00,0x0f]
+// CHECK: movi v1.2s, #0x1, msl #16 // encoding: [0x21,0xd4,0x00,0x0f]
+// CHECK: movi v0.4s, #0x1, msl #8 // encoding: [0x20,0xc4,0x00,0x4f]
+// CHECK: movi v31.4s, #0x1, msl #16 // encoding: [0x3f,0xd4,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Inverted Immediate Masked
+//----------------------------------------------------------------------
+ mvni v1.2s, #0x1, msl #8
+ mvni v0.2s, #0x1, msl #16
+ mvni v31.4s, #0x1, msl #8
+ mvni v0.4s, #0x1, msl #16
+
+// CHECK: mvni v1.2s, #0x1, msl #8 // encoding: [0x21,0xc4,0x00,0x2f]
+// CHECK: mvni v0.2s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x2f]
+// CHECK: mvni v31.4s, #0x1, msl #8 // encoding: [0x3f,0xc4,0x00,0x6f]
+// CHECK: mvni v0.4s, #0x1, msl #16 // encoding: [0x20,0xd4,0x00,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - per byte
+//----------------------------------------------------------------------
+ movi v0.8b, #0
+ movi v31.8b, #0xff
+ movi v15.16b, #0xf
+ movi v31.16b, #0x1f
+
+// CHECK: movi v0.8b, #0x0 // encoding: [0x00,0xe4,0x00,0x0f]
+// CHECK: movi v31.8b, #0xff // encoding: [0xff,0xe7,0x07,0x0f]
+// CHECK: movi v15.16b, #0xf // encoding: [0xef,0xe5,0x00,0x4f]
+// CHECK: movi v31.16b, #0x1f // encoding: [0xff,0xe7,0x00,0x4f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, per doubleword
+//----------------------------------------------------------------------
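+// Each bit of an 8-bit immediate expands to a byte of all-ones or all-zeros,
+// so only such bytemask patterns assemble; #0xff00ff00ff00ff00 is imm8 0xaa.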
+ movi v0.2d, #0xff00ff00ff00ff00
+
+// CHECK: movi v0.2d, #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x6f]
+
+//----------------------------------------------------------------------
+// Vector Move Immediate - bytemask, one doubleword
+//----------------------------------------------------------------------
+ movi d0, #0xff00ff00ff00ff00
+
+// CHECK: movi d0, #0xff00ff00ff00ff00 // encoding: [0x40,0xe5,0x05,0x2f]
+
+//----------------------------------------------------------------------
+// Vector Floating Point Move Immediate
+//----------------------------------------------------------------------
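+// Only values expressible as the 8-bit modified immediate assemble here,
+// roughly +/-(1.0 to 1.9375 in 1/16 steps) * 2^n with n in -3..4; #1.0 is
+// one such value.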
+ fmov v1.2s, #1.0
+ fmov v15.4s, #1.0
+ fmov v31.2d, #1.0
+
+// CHECK: fmov v1.2s, #1.00000000 // encoding: [0x01,0xf6,0x03,0x0f]
+// CHECK: fmov v15.4s, #1.00000000 // encoding: [0x0f,0xf6,0x03,0x4f]
+// CHECK: fmov v31.2d, #1.00000000 // encoding: [0x1f,0xf6,0x03,0x6f]
+
+
+//----------------------------------------------------------------------
+// Vector Move - register
+//----------------------------------------------------------------------
+ mov v0.8b, v31.8b
+ mov v15.16b, v16.16b
+ orr v0.8b, v31.8b, v31.8b
+ orr v15.16b, v16.16b, v16.16b
+
+// CHECK: mov v0.8b, v31.8b // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK: mov v15.16b, v16.16b // encoding: [0x0f,0x1e,0xb0,0x4e]
+// CHECK: mov v0.8b, v31.8b // encoding: [0xe0,0x1f,0xbf,0x0e]
+// CHECK: mov v15.16b, v16.16b // encoding: [0x0f,0x1e,0xb0,0x4e]
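+// mov Vd.T, Vn.T is an alias of orr Vd.T, Vn.T, Vn.T, which is why the two
+// pairs above share encodings and the orr forms print back as mov.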
+
diff --git a/test/MC/AArch64/neon-mul-div-instructions.s b/test/MC/AArch64/neon-mul-div-instructions.s
new file mode 100644
index 0000000000..1fe6d2b819
--- /dev/null
+++ b/test/MC/AArch64/neon-mul-div-instructions.s
@@ -0,0 +1,86 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+//----------------------------------------------------------------------
+// Vector Integer Mul
+//----------------------------------------------------------------------
+ mul v0.8b, v1.8b, v2.8b
+ mul v0.16b, v1.16b, v2.16b
+ mul v0.4h, v1.4h, v2.4h
+ mul v0.8h, v1.8h, v2.8h
+ mul v0.2s, v1.2s, v2.2s
+ mul v0.4s, v1.4s, v2.4s
+
+// CHECK: mul v0.8b, v1.8b, v2.8b // encoding: [0x20,0x9c,0x22,0x0e]
+// CHECK: mul v0.16b, v1.16b, v2.16b // encoding: [0x20,0x9c,0x22,0x4e]
+// CHECK: mul v0.4h, v1.4h, v2.4h // encoding: [0x20,0x9c,0x62,0x0e]
+// CHECK: mul v0.8h, v1.8h, v2.8h // encoding: [0x20,0x9c,0x62,0x4e]
+// CHECK: mul v0.2s, v1.2s, v2.2s // encoding: [0x20,0x9c,0xa2,0x0e]
+// CHECK: mul v0.4s, v1.4s, v2.4s // encoding: [0x20,0x9c,0xa2,0x4e]
+
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Mul
+//----------------------------------------------------------------------
+ fmul v0.2s, v1.2s, v2.2s
+ fmul v0.4s, v1.4s, v2.4s
+ fmul v0.2d, v1.2d, v2.2d
+
+// CHECK: fmul v0.2s, v1.2s, v2.2s // encoding: [0x20,0xdc,0x22,0x2e]
+// CHECK: fmul v0.4s, v1.4s, v2.4s // encoding: [0x20,0xdc,0x22,0x6e]
+// CHECK: fmul v0.2d, v1.2d, v2.2d // encoding: [0x20,0xdc,0x62,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Floating-Point Div
+//----------------------------------------------------------------------
+ fdiv v0.2s, v1.2s, v2.2s
+ fdiv v0.4s, v1.4s, v2.4s
+ fdiv v0.2d, v1.2d, v2.2d
+
+// CHECK: fdiv v0.2s, v1.2s, v2.2s // encoding: [0x20,0xfc,0x22,0x2e]
+// CHECK: fdiv v0.4s, v1.4s, v2.4s // encoding: [0x20,0xfc,0x22,0x6e]
+// CHECK: fdiv v0.2d, v1.2d, v2.2d // encoding: [0x20,0xfc,0x62,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Multiply (Polynomial)
+//----------------------------------------------------------------------
+ pmul v17.8b, v31.8b, v16.8b
+ pmul v0.16b, v1.16b, v2.16b
+
+// CHECK: pmul v17.8b, v31.8b, v16.8b // encoding: [0xf1,0x9f,0x30,0x2e]
+// CHECK: pmul v0.16b, v1.16b, v2.16b // encoding: [0x20,0x9c,0x22,0x6e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Doubling Multiply High
+//----------------------------------------------------------------------
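+// Per lane this computes sat((2 * a * b) >> esize), i.e. a fixed-point
+// multiply returning the high half; the rounding variant below adds
+// 1 << (esize - 1) before the shift.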
+ sqdmulh v2.4h, v25.4h, v3.4h
+ sqdmulh v12.8h, v5.8h, v13.8h
+ sqdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqdmulh v2.4h, v25.4h, v3.4h // encoding: [0x22,0xb7,0x63,0x0e]
+// CHECK: sqdmulh v12.8h, v5.8h, v13.8h // encoding: [0xac,0xb4,0x6d,0x4e]
+// CHECK: sqdmulh v3.2s, v1.2s, v30.2s // encoding: [0x23,0xb4,0xbe,0x0e]
+
+//----------------------------------------------------------------------
+// Vector Saturating Rounding Doubling Multiply High
+//----------------------------------------------------------------------
+ sqrdmulh v2.4h, v25.4h, v3.4h
+ sqrdmulh v12.8h, v5.8h, v13.8h
+ sqrdmulh v3.2s, v1.2s, v30.2s
+
+// CHECK: sqrdmulh v2.4h, v25.4h, v3.4h // encoding: [0x22,0xb7,0x63,0x2e]
+// CHECK: sqrdmulh v12.8h, v5.8h, v13.8h // encoding: [0xac,0xb4,0x6d,0x6e]
+// CHECK: sqrdmulh v3.2s, v1.2s, v30.2s // encoding: [0x23,0xb4,0xbe,0x2e]
+
+//----------------------------------------------------------------------
+// Vector Multiply Extended
+//----------------------------------------------------------------------
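+// fmulx behaves like fmul except that (+/-)0.0 * (+/-)infinity returns
+// (+/-)2.0 rather than NaN, keeping reciprocal-estimate Newton steps
+// well-defined.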
+ fmulx v21.2s, v5.2s, v13.2s
+ fmulx v1.4s, v25.4s, v3.4s
+ fmulx v31.2d, v22.2d, v2.2d
+
+// CHECK: fmulx v21.2s, v5.2s, v13.2s // encoding: [0xb5,0xdc,0x2d,0x0e]
+// CHECK: fmulx v1.4s, v25.4s, v3.4s // encoding: [0x21,0xdf,0x23,0x4e]
+// CHECK: fmulx v31.2d, v22.2d, v2.2d // encoding: [0xdf,0xde,0x62,0x4e]
+
diff --git a/test/MC/AArch64/neon-rounding-halving-add.s b/test/MC/AArch64/neon-rounding-halving-add.s
new file mode 100644
index 0000000000..47ac212680
--- /dev/null
+++ b/test/MC/AArch64/neon-rounding-halving-add.s
@@ -0,0 +1,39 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Signed)
+//------------------------------------------------------------------------------
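+// Unlike shadd/uhadd, the rounding forms compute (a + b + 1) >> 1 per lane,
+// e.g. int8 lanes 2 and 3 give 3 rather than the truncated 2.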
+ srhadd v0.8b, v1.8b, v2.8b
+ srhadd v0.16b, v1.16b, v2.16b
+ srhadd v0.4h, v1.4h, v2.4h
+ srhadd v0.8h, v1.8h, v2.8h
+ srhadd v0.2s, v1.2s, v2.2s
+ srhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: srhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x14,0x22,0x0e]
+// CHECK: srhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x14,0x22,0x4e]
+// CHECK: srhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x62,0x0e]
+// CHECK: srhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x62,0x4e]
+// CHECK: srhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x14,0xa2,0x0e]
+// CHECK: srhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x14,0xa2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Halving Add (Unsigned)
+//------------------------------------------------------------------------------
+ urhadd v0.8b, v1.8b, v2.8b
+ urhadd v0.16b, v1.16b, v2.16b
+ urhadd v0.4h, v1.4h, v2.4h
+ urhadd v0.8h, v1.8h, v2.8h
+ urhadd v0.2s, v1.2s, v2.2s
+ urhadd v0.4s, v1.4s, v2.4s
+
+// CHECK: urhadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x14,0x22,0x2e]
+// CHECK: urhadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x14,0x22,0x6e]
+// CHECK: urhadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x14,0x62,0x2e]
+// CHECK: urhadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x14,0x62,0x6e]
+// CHECK: urhadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x14,0xa2,0x2e]
+// CHECK: urhadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x14,0xa2,0x6e]
+
diff --git a/test/MC/AArch64/neon-rounding-shift.s b/test/MC/AArch64/neon-rounding-shift.s
new file mode 100644
index 0000000000..f3c70d7e38
--- /dev/null
+++ b/test/MC/AArch64/neon-rounding-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
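+// The low byte of each lane of the second operand is a signed shift count:
+// positive counts shift left, negative counts shift right with rounding (as
+// if 1 << (n - 1) were added before an n-bit right shift).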
+ srshl v0.8b, v1.8b, v2.8b
+ srshl v0.16b, v1.16b, v2.16b
+ srshl v0.4h, v1.4h, v2.4h
+ srshl v0.8h, v1.8h, v2.8h
+ srshl v0.2s, v1.2s, v2.2s
+ srshl v0.4s, v1.4s, v2.4s
+ srshl v0.2d, v1.2d, v2.2d
+
+// CHECK: srshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x54,0x22,0x0e]
+// CHECK: srshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x54,0x22,0x4e]
+// CHECK: srshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x54,0x62,0x0e]
+// CHECK: srshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x54,0x62,0x4e]
+// CHECK: srshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x54,0xa2,0x0e]
+// CHECK: srshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x54,0xa2,0x4e]
+// CHECK: srshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x54,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ urshl v0.8b, v1.8b, v2.8b
+ urshl v0.16b, v1.16b, v2.16b
+ urshl v0.4h, v1.4h, v2.4h
+ urshl v0.8h, v1.8h, v2.8h
+ urshl v0.2s, v1.2s, v2.2s
+ urshl v0.4s, v1.4s, v2.4s
+ urshl v0.2d, v1.2d, v2.2d
+
+// CHECK: urshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x54,0x22,0x2e]
+// CHECK: urshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x54,0x22,0x6e]
+// CHECK: urshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x54,0x62,0x2e]
+// CHECK: urshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x54,0x62,0x6e]
+// CHECK: urshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x54,0xa2,0x2e]
+// CHECK: urshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x54,0xa2,0x6e]
+// CHECK: urshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x54,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ srshl d17, d31, d8
+
+// CHECK: srshl d17, d31, d8 // encoding: [0xf1,0x57,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ urshl d17, d31, d8
+
+// CHECK: urshl d17, d31, d8 // encoding: [0xf1,0x57,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-saturating-add-sub.s b/test/MC/AArch64/neon-saturating-add-sub.s
new file mode 100644
index 0000000000..1032ae47e2
--- /dev/null
+++ b/test/MC/AArch64/neon-saturating-add-sub.s
@@ -0,0 +1,133 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
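+// These clamp to the element type's range instead of wrapping: e.g. for
+// int8 lanes, 0x70 + 0x70 saturates to 0x7f rather than wrapping to 0xe0.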
+ sqadd v0.8b, v1.8b, v2.8b
+ sqadd v0.16b, v1.16b, v2.16b
+ sqadd v0.4h, v1.4h, v2.4h
+ sqadd v0.8h, v1.8h, v2.8h
+ sqadd v0.2s, v1.2s, v2.2s
+ sqadd v0.4s, v1.4s, v2.4s
+ sqadd v0.2d, v1.2d, v2.2d
+
+// CHECK: sqadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x0c,0x22,0x0e]
+// CHECK: sqadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x0c,0x22,0x4e]
+// CHECK: sqadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0x62,0x0e]
+// CHECK: sqadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0x62,0x4e]
+// CHECK: sqadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x0c,0xa2,0x0e]
+// CHECK: sqadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x0c,0xa2,0x4e]
+// CHECK: sqadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0x0c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+ uqadd v0.8b, v1.8b, v2.8b
+ uqadd v0.16b, v1.16b, v2.16b
+ uqadd v0.4h, v1.4h, v2.4h
+ uqadd v0.8h, v1.8h, v2.8h
+ uqadd v0.2s, v1.2s, v2.2s
+ uqadd v0.4s, v1.4s, v2.4s
+ uqadd v0.2d, v1.2d, v2.2d
+
+// CHECK: uqadd v0.8b, v1.8b, v2.8b // encoding: [0x20,0x0c,0x22,0x2e]
+// CHECK: uqadd v0.16b, v1.16b, v2.16b // encoding: [0x20,0x0c,0x22,0x6e]
+// CHECK: uqadd v0.4h, v1.4h, v2.4h // encoding: [0x20,0x0c,0x62,0x2e]
+// CHECK: uqadd v0.8h, v1.8h, v2.8h // encoding: [0x20,0x0c,0x62,0x6e]
+// CHECK: uqadd v0.2s, v1.2s, v2.2s // encoding: [0x20,0x0c,0xa2,0x2e]
+// CHECK: uqadd v0.4s, v1.4s, v2.4s // encoding: [0x20,0x0c,0xa2,0x6e]
+// CHECK: uqadd v0.2d, v1.2d, v2.2d // encoding: [0x20,0x0c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+ sqsub v0.8b, v1.8b, v2.8b
+ sqsub v0.16b, v1.16b, v2.16b
+ sqsub v0.4h, v1.4h, v2.4h
+ sqsub v0.8h, v1.8h, v2.8h
+ sqsub v0.2s, v1.2s, v2.2s
+ sqsub v0.4s, v1.4s, v2.4s
+ sqsub v0.2d, v1.2d, v2.2d
+
+// CHECK: sqsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x2c,0x22,0x0e]
+// CHECK: sqsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x2c,0x22,0x4e]
+// CHECK: sqsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x2c,0x62,0x0e]
+// CHECK: sqsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x2c,0x62,0x4e]
+// CHECK: sqsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x2c,0xa2,0x0e]
+// CHECK: sqsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x2c,0xa2,0x4e]
+// CHECK: sqsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x2c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+ uqsub v0.8b, v1.8b, v2.8b
+ uqsub v0.16b, v1.16b, v2.16b
+ uqsub v0.4h, v1.4h, v2.4h
+ uqsub v0.8h, v1.8h, v2.8h
+ uqsub v0.2s, v1.2s, v2.2s
+ uqsub v0.4s, v1.4s, v2.4s
+ uqsub v0.2d, v1.2d, v2.2d
+
+// CHECK: uqsub v0.8b, v1.8b, v2.8b // encoding: [0x20,0x2c,0x22,0x2e]
+// CHECK: uqsub v0.16b, v1.16b, v2.16b // encoding: [0x20,0x2c,0x22,0x6e]
+// CHECK: uqsub v0.4h, v1.4h, v2.4h // encoding: [0x20,0x2c,0x62,0x2e]
+// CHECK: uqsub v0.8h, v1.8h, v2.8h // encoding: [0x20,0x2c,0x62,0x6e]
+// CHECK: uqsub v0.2s, v1.2s, v2.2s // encoding: [0x20,0x2c,0xa2,0x2e]
+// CHECK: uqsub v0.4s, v1.4s, v2.4s // encoding: [0x20,0x2c,0xa2,0x6e]
+// CHECK: uqsub v0.2d, v1.2d, v2.2d // encoding: [0x20,0x2c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Signed)
+//------------------------------------------------------------------------------
+ sqadd b0, b1, b2
+ sqadd h10, h11, h12
+ sqadd s20, s21, s2
+ sqadd d17, d31, d8
+
+// CHECK: sqadd b0, b1, b2 // encoding: [0x20,0x0c,0x22,0x5e]
+// CHECK: sqadd h10, h11, h12 // encoding: [0x6a,0x0d,0x6c,0x5e]
+// CHECK: sqadd s20, s21, s2 // encoding: [0xb4,0x0e,0xa2,0x5e]
+// CHECK: sqadd d17, d31, d8 // encoding: [0xf1,0x0f,0xe8,0x5e]
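+// Note how the scalar forms differ from the vector .8b forms above only in
+// the top byte (0x5e rather than 0x0e, i.e. bits 28 and 30 set); the
+// register fields encode identically.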
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Add (Unsigned)
+//------------------------------------------------------------------------------
+ uqadd b0, b1, b2
+ uqadd h10, h11, h12
+ uqadd s20, s21, s2
+ uqadd d17, d31, d8
+
+// CHECK: uqadd b0, b1, b2 // encoding: [0x20,0x0c,0x22,0x7e]
+// CHECK: uqadd h10, h11, h12 // encoding: [0x6a,0x0d,0x6c,0x7e]
+// CHECK: uqadd s20, s21, s2 // encoding: [0xb4,0x0e,0xa2,0x7e]
+// CHECK: uqadd d17, d31, d8 // encoding: [0xf1,0x0f,0xe8,0x7e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Signed)
+//------------------------------------------------------------------------------
+ sqsub b0, b1, b2
+ sqsub h10, h11, h12
+ sqsub s20, s21, s2
+ sqsub d17, d31, d8
+
+// CHECK: sqsub b0, b1, b2 // encoding: [0x20,0x2c,0x22,0x5e]
+// CHECK: sqsub h10, h11, h12 // encoding: [0x6a,0x2d,0x6c,0x5e]
+// CHECK: sqsub s20, s21, s2 // encoding: [0xb4,0x2e,0xa2,0x5e]
+// CHECK: sqsub d17, d31, d8 // encoding: [0xf1,0x2f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Sub (Unsigned)
+//------------------------------------------------------------------------------
+ uqsub b0, b1, b2
+ uqsub h10, h11, h12
+ uqsub s20, s21, s2
+ uqsub d17, d31, d8
+
+// CHECK: uqsub b0, b1, b2 // encoding: [0x20,0x2c,0x22,0x7e]
+// CHECK: uqsub h10, h11, h12 // encoding: [0x6a,0x2d,0x6c,0x7e]
+// CHECK: uqsub s20, s21, s2 // encoding: [0xb4,0x2e,0xa2,0x7e]
+// CHECK: uqsub d17, d31, d8 // encoding: [0xf1,0x2f,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-saturating-rounding-shift.s b/test/MC/AArch64/neon-saturating-rounding-shift.s
new file mode 100644
index 0000000000..a36e68988e
--- /dev/null
+++ b/test/MC/AArch64/neon-saturating-rounding-shift.s
@@ -0,0 +1,70 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
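+// Semantics note (illustrative only): the shift amount is the signed value
+// in the low byte of each second-source element, so negative amounts shift
+// right with rounding; e.g. a signed element of 3 shifted by -1 gives
+// (3 + 1) >> 1 = 2 rather than the truncating result 1.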
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqrshl v0.8b, v1.8b, v2.8b
+ sqrshl v0.16b, v1.16b, v2.16b
+ sqrshl v0.4h, v1.4h, v2.4h
+ sqrshl v0.8h, v1.8h, v2.8h
+ sqrshl v0.2s, v1.2s, v2.2s
+ sqrshl v0.4s, v1.4s, v2.4s
+ sqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqrshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x5c,0x22,0x0e]
+// CHECK: sqrshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x5c,0x22,0x4e]
+// CHECK: sqrshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x5c,0x62,0x0e]
+// CHECK: sqrshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x5c,0x62,0x4e]
+// CHECK: sqrshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x5c,0xa2,0x0e]
+// CHECK: sqrshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x5c,0xa2,0x4e]
+// CHECK: sqrshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x5c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqrshl v0.8b, v1.8b, v2.8b
+ uqrshl v0.16b, v1.16b, v2.16b
+ uqrshl v0.4h, v1.4h, v2.4h
+ uqrshl v0.8h, v1.8h, v2.8h
+ uqrshl v0.2s, v1.2s, v2.2s
+ uqrshl v0.4s, v1.4s, v2.4s
+ uqrshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqrshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x5c,0x22,0x2e]
+// CHECK: uqrshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x5c,0x22,0x6e]
+// CHECK: uqrshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x5c,0x62,0x2e]
+// CHECK: uqrshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x5c,0x62,0x6e]
+// CHECK: uqrshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x5c,0xa2,0x2e]
+// CHECK: uqrshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x5c,0xa2,0x6e]
+// CHECK: uqrshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x5c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqrshl b0, b1, b2
+ sqrshl h10, h11, h12
+ sqrshl s20, s21, s2
+ sqrshl d17, d31, d8
+
+// CHECK: sqrshl b0, b1, b2 // encoding: [0x20,0x5c,0x22,0x5e]
+// CHECK: sqrshl h10, h11, h12 // encoding: [0x6a,0x5d,0x6c,0x5e]
+// CHECK: sqrshl s20, s21, s2 // encoding: [0xb4,0x5e,0xa2,0x5e]
+// CHECK: sqrshl d17, d31, d8 // encoding: [0xf1,0x5f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Rounding Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqrshl b0, b1, b2
+ uqrshl h10, h11, h12
+ uqrshl s20, s21, s2
+ uqrshl d17, d31, d8
+
+// CHECK: uqrshl b0, b1, b2 // encoding: [0x20,0x5c,0x22,0x7e]
+// CHECK: uqrshl h10, h11, h12 // encoding: [0x6a,0x5d,0x6c,0x7e]
+// CHECK: uqrshl s20, s21, s2 // encoding: [0xb4,0x5e,0xa2,0x7e]
+// CHECK: uqrshl d17, d31, d8 // encoding: [0xf1,0x5f,0xe8,0x7e]
+
+
diff --git a/test/MC/AArch64/neon-saturating-shift.s b/test/MC/AArch64/neon-saturating-shift.s
new file mode 100644
index 0000000000..2c8456db63
--- /dev/null
+++ b/test/MC/AArch64/neon-saturating-shift.s
@@ -0,0 +1,69 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
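+// Semantics note (illustrative only): results saturate when significant
+// bits would be shifted out, e.g. a signed byte 0x40 shifted left by 2
+// saturates to 0x7f instead of wrapping to 0x00.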
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqshl v0.8b, v1.8b, v2.8b
+ sqshl v0.16b, v1.16b, v2.16b
+ sqshl v0.4h, v1.4h, v2.4h
+ sqshl v0.8h, v1.8h, v2.8h
+ sqshl v0.2s, v1.2s, v2.2s
+ sqshl v0.4s, v1.4s, v2.4s
+ sqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sqshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x4c,0x22,0x0e]
+// CHECK: sqshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x4c,0x22,0x4e]
+// CHECK: sqshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x4c,0x62,0x0e]
+// CHECK: sqshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x4c,0x62,0x4e]
+// CHECK: sqshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x4c,0xa2,0x0e]
+// CHECK: sqshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x4c,0xa2,0x4e]
+// CHECK: sqshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x4c,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqshl v0.8b, v1.8b, v2.8b
+ uqshl v0.16b, v1.16b, v2.16b
+ uqshl v0.4h, v1.4h, v2.4h
+ uqshl v0.8h, v1.8h, v2.8h
+ uqshl v0.2s, v1.2s, v2.2s
+ uqshl v0.4s, v1.4s, v2.4s
+ uqshl v0.2d, v1.2d, v2.2d
+
+// CHECK: uqshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x4c,0x22,0x2e]
+// CHECK: uqshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x4c,0x22,0x6e]
+// CHECK: uqshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x4c,0x62,0x2e]
+// CHECK: uqshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x4c,0x62,0x6e]
+// CHECK: uqshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x4c,0xa2,0x2e]
+// CHECK: uqshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x4c,0xa2,0x6e]
+// CHECK: uqshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x4c,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sqshl b0, b1, b2
+ sqshl h10, h11, h12
+ sqshl s20, s21, s2
+ sqshl d17, d31, d8
+
+// CHECK: sqshl b0, b1, b2 // encoding: [0x20,0x4c,0x22,0x5e]
+// CHECK: sqshl h10, h11, h12 // encoding: [0x6a,0x4d,0x6c,0x5e]
+// CHECK: sqshl s20, s21, s2 // encoding: [0xb4,0x4e,0xa2,0x5e]
+// CHECK: sqshl d17, d31, d8 // encoding: [0xf1,0x4f,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Saturating Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ uqshl b0, b1, b2
+ uqshl h10, h11, h12
+ uqshl s20, s21, s2
+ uqshl d17, d31, d8
+
+// CHECK: uqshl b0, b1, b2 // encoding: [0x20,0x4c,0x22,0x7e]
+// CHECK: uqshl h10, h11, h12 // encoding: [0x6a,0x4d,0x6c,0x7e]
+// CHECK: uqshl s20, s21, s2 // encoding: [0xb4,0x4e,0xa2,0x7e]
+// CHECK: uqshl d17, d31, d8 // encoding: [0xf1,0x4f,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/neon-shift.s b/test/MC/AArch64/neon-shift.s
new file mode 100644
index 0000000000..be1799e2c1
--- /dev/null
+++ b/test/MC/AArch64/neon-shift.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -show-encoding < %s | FileCheck %s
+
+// Check that the assembler can handle the documented syntax for AArch64
+
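+// Semantics note (illustrative only): sshl/ushl take the shift amount from
+// the signed low byte of each element of the second source, so a negative
+// amount is a (truncating) right shift.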
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sshl v0.8b, v1.8b, v2.8b
+ sshl v0.16b, v1.16b, v2.16b
+ sshl v0.4h, v1.4h, v2.4h
+ sshl v0.8h, v1.8h, v2.8h
+ sshl v0.2s, v1.2s, v2.2s
+ sshl v0.4s, v1.4s, v2.4s
+ sshl v0.2d, v1.2d, v2.2d
+
+// CHECK: sshl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x44,0x22,0x0e]
+// CHECK: sshl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x44,0x22,0x4e]
+// CHECK: sshl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x44,0x62,0x0e]
+// CHECK: sshl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x44,0x62,0x4e]
+// CHECK: sshl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x44,0xa2,0x0e]
+// CHECK: sshl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x44,0xa2,0x4e]
+// CHECK: sshl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x44,0xe2,0x4e]
+
+//------------------------------------------------------------------------------
+// Vector Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ ushl v0.8b, v1.8b, v2.8b
+ ushl v0.16b, v1.16b, v2.16b
+ ushl v0.4h, v1.4h, v2.4h
+ ushl v0.8h, v1.8h, v2.8h
+ ushl v0.2s, v1.2s, v2.2s
+ ushl v0.4s, v1.4s, v2.4s
+ ushl v0.2d, v1.2d, v2.2d
+
+// CHECK: ushl v0.8b, v1.8b, v2.8b // encoding: [0x20,0x44,0x22,0x2e]
+// CHECK: ushl v0.16b, v1.16b, v2.16b // encoding: [0x20,0x44,0x22,0x6e]
+// CHECK: ushl v0.4h, v1.4h, v2.4h // encoding: [0x20,0x44,0x62,0x2e]
+// CHECK: ushl v0.8h, v1.8h, v2.8h // encoding: [0x20,0x44,0x62,0x6e]
+// CHECK: ushl v0.2s, v1.2s, v2.2s // encoding: [0x20,0x44,0xa2,0x2e]
+// CHECK: ushl v0.4s, v1.4s, v2.4s // encoding: [0x20,0x44,0xa2,0x6e]
+// CHECK: ushl v0.2d, v1.2d, v2.2d // encoding: [0x20,0x44,0xe2,0x6e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Signed)
+//------------------------------------------------------------------------------
+ sshl d17, d31, d8
+
+// CHECK: sshl d17, d31, d8 // encoding: [0xf1,0x47,0xe8,0x5e]
+
+//------------------------------------------------------------------------------
+// Scalar Integer Shift Left (Unsigned)
+//------------------------------------------------------------------------------
+ ushl d17, d31, d8
+
+// CHECK: ushl d17, d31, d8 // encoding: [0xf1,0x47,0xe8,0x7e]
+
diff --git a/test/MC/AArch64/noneon-diagnostics.s b/test/MC/AArch64/noneon-diagnostics.s
new file mode 100644
index 0000000000..ea786c0ba6
--- /dev/null
+++ b/test/MC/AArch64/noneon-diagnostics.s
@@ -0,0 +1,28 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=-neon < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
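+// The `not` on the first RUN line inverts llvm-mc's exit status, so the
+// test passes precisely when the assembler rejects these AdvSIMD
+// instructions under -neon with the diagnostics checked below.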
+
+ fmla v3.4s, v12.4s, v17.4s
+ fmla v1.2d, v30.2d, v20.2d
+ fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmla v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT: ^
+
+ fmls v3.4s, v12.4s, v17.4s
+ fmls v1.2d, v30.2d, v20.2d
+ fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v3.4s, v12.4s, v17.4s
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v1.2d, v30.2d, v20.2d
+// CHECK-ERROR-NEXT: ^
+// CHECK-ERROR-NEXT: error: instruction requires a CPU feature not currently enabled
+// CHECK-ERROR-NEXT: fmls v9.2s, v9.2s, v0.2s
+// CHECK-ERROR-NEXT: ^
diff --git a/test/MC/Disassembler/AArch64/neon-instructions.txt b/test/MC/Disassembler/AArch64/neon-instructions.txt
new file mode 100644
index 0000000000..40d1f4c66f
--- /dev/null
+++ b/test/MC/Disassembler/AArch64/neon-instructions.txt
@@ -0,0 +1,673 @@
+# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+neon -disassemble < %s | FileCheck %s
+
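+# Each four-byte line below is one 32-bit instruction word with its bytes in
+# little-endian order; llvm-mc -disassemble prints the canonical assembly,
+# which the CHECK lines match.
+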
+#------------------------------------------------------------------------------
+# Vector Integer Add/Sub
+#------------------------------------------------------------------------------
+# CHECK: add v31.8b, v31.8b, v31.8b
+# CHECK: sub v0.2d, v0.2d, v0.2d
+0xff 0x87 0x3f 0x0e
+0x00 0x84 0xe0 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Add/Sub
+#------------------------------------------------------------------------------
+
+# CHECK: fadd v0.4s, v0.4s, v0.4s
+# CHECK: fsub v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x4e
+0xff 0xd7 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Integer Mul
+#------------------------------------------------------------------------------
+# CHECK: mul v0.8b, v1.8b, v2.8b
+0x20 0x9c 0x22 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Mul/Div
+#------------------------------------------------------------------------------
+# CHECK: fmul v0.2s, v1.2s, v2.2s
+# CHECK: fdiv v31.2s, v31.2s, v31.2s
+0x20 0xdc 0x22 0x2e
+0xff 0xff 0x3f 0x2e
+
+#----------------------------------------------------------------------
+# Vector Polynomial Multiply
+#----------------------------------------------------------------------
+# CHECK: pmul v0.8b, v15.8b, v16.8b
+# CHECK: pmul v31.16b, v7.16b, v8.16b
+0xe0 0x9d 0x30 0x2e
+0xff 0x9c 0x28 0x6e
+
+#------------------------------------------------------------------------------
+# Vector And, Orr, Eor, Orn, Bic
+#------------------------------------------------------------------------------
+# CHECK: and v2.8b, v2.8b, v2.8b
+# CHECK: orr v31.16b, v31.16b, v30.16b
+# CHECK: eor v0.16b, v1.16b, v2.16b
+# CHECK: orn v9.16b, v10.16b, v11.16b
+# CHECK: bic v31.8b, v30.8b, v29.8b
+0x42 0x1c 0x22 0x0e
+0xff 0x1f 0xbe 0x4e
+0x20 0x1c 0x22 0x6e
+0x49 0x1d 0xeb 0x4e
+0xdf 0x1f 0x7d 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Bsl, Bit, Bif
+#------------------------------------------------------------------------------
+# CHECK: bsl v0.8b, v1.8b, v2.8b
+# CHECK: bit v31.16b, v31.16b, v31.16b
+# CHECK: bif v0.16b, v1.16b, v2.16b
+0x20 0x1c 0x62 0x2e
+0xff 0x1f 0xbf 0x6e
+0x20 0x1c 0xe2 0x6e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: mla v0.8b, v1.8b, v2.8b
+# CHECK: mls v31.4h, v31.4h, v31.4h
+0x20 0x94 0x22 0x0e
+0xff 0x97 0x7f 0x2e
+
+#------------------------------------------------------------------------------
+# Vector Floating-Point Multiply-accumulate and Multiply-subtract
+#------------------------------------------------------------------------------
+# CHECK: fmla v0.2s, v1.2s, v2.2s
+# CHECK: fmls v31.2s, v31.2s, v31.2s
+0x20 0xcc 0x22 0x0e
+0xff 0xcf 0xbf 0x0e
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Shifted
+# Vector Move Inverted Immediate Shifted
+# Vector Bitwise Bit Clear (AND NOT) - immediate
+# Vector Bitwise OR - immediate
+#------------------------------------------------------------------------------
+# CHECK: movi v31.4s, #0xff, lsl #24
+# CHECK: mvni v0.2s, #0x0
+# CHECK: bic v15.4h, #0xf, lsl #8
+# CHECK: orr v16.8h, #0x1f
+0xff 0x67 0x07 0x4f
+0x00 0x04 0x00 0x2f
+0xef 0xb5 0x00 0x2f
+0xf0 0x97 0x00 0x4f
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate Masked
+# Vector Move Inverted Immediate Masked
+#------------------------------------------------------------------------------
+# CHECK: movi v8.2s, #0x8, msl #8
+# CHECK: mvni v16.4s, #0x10, msl #16
+0x08 0xc5 0x00 0x0f
+0x10 0xd6 0x00 0x6f
+
+#------------------------------------------------------------------------------
+# Vector Move Immediate - per byte
+# Vector Move Immediate - bytemask, per doubleword
+# Vector Move Immediate - bytemask, one doubleword
+#------------------------------------------------------------------------------
+# CHECK: movi v16.8b, #0xff
+# CHECK: movi v31.16b, #0x1f
+# CHECK: movi d15, #0xff00ff00ff00ff
+# CHECK: movi v31.2d, #0xff0000ff0000ffff
+0xf0 0xe7 0x07 0x0f
+0xff 0xe7 0x00 0x4f
+0xaf 0xe6 0x02 0x2f
+0x7f 0xe6 0x04 0x6f
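+# The 64-bit bytemask forms expand each bit of an 8-bit immediate to a byte
+# of 0xff or 0x00; the d15 case above encodes imm8 = 0x55, which expands to
+# 0x00ff00ff00ff00ff (printed as #0xff00ff00ff00ff).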
+
+#------------------------------------------------------------------------------
+# Vector Floating Point Move Immediate
+#------------------------------------------------------------------------------
+# CHECK: fmov v0.2s, #13.0
+# CHECK: fmov v15.4s, #1.0
+# CHECK: fmov v31.2d, #-1.25
+0x40 0xf5 0x01 0x0f
+0x0f 0xf6 0x03 0x4f
+0x9f 0xf6 0x07 0x6f
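+# The floating-point immediates use the 8-bit VFP format (sign, 3-bit
+# exponent, 4-bit fraction), covering +/-(16..31)/16 x 2^(-3..4); e.g.
+# 13.0 = (26/16) x 2^3 and -1.25 = -(20/16) x 2^0 are representable.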
+
+#------------------------------------------------------------------------------
+# Vector Move - register
+#------------------------------------------------------------------------------
+# CHECK: mov v1.16b, v15.16b
+# CHECK: mov v25.8b, v4.8b
+0xe1 0x1d 0xaf 0x4e
+0x99 0x1c 0xa4 0x0e
+
+#----------------------------------------------------------------------
+# Vector Absolute Difference and Accumulate (Signed, Unsigned)
+# Vector Absolute Difference (Signed, Unsigned)
+# Vector Absolute Difference (Floating Point)
+#----------------------------------------------------------------------
+
+# CHECK: uaba v0.8b, v1.8b, v2.8b
+# CHECK: saba v31.16b, v30.16b, v29.16b
+# CHECK: uabd v15.4h, v16.4h, v17.4h
+# CHECK: sabd v5.4h, v4.4h, v6.4h
+# CHECK: fabd v1.4s, v31.4s, v16.4s
+0x20 0x7c 0x22 0x2e
+0xdf 0x7f 0x3d 0x4e
+0x0f 0x76 0x71 0x2e
+0x85 0x74 0x66 0x0e
+0xe1 0xd7 0xb0 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Add
+# Scalar Integer Sub
+#----------------------------------------------------------------------
+
+# CHECK: add d17, d31, d29
+# CHECK: sub d15, d5, d16
+0xf1 0x87 0xfd 0x5e
+0xaf 0x84 0xf0 0x7e
+
+#----------------------------------------------------------------------
+# Vector Reciprocal Square Root Step (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: frsqrts v31.2d, v15.2d, v8.2d
+0xff 0xfd 0xe8 0x4e
+
+#----------------------------------------------------------------------
+# Vector Reciprocal Step (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: frecps v5.4s, v7.4s, v16.4s
+0xe5 0xfc 0x30 0x4e
+
+#----------------------------------------------------------------------
+# Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: facge v0.4s, v31.4s, v16.4s
+0xe0 0xef 0x30 0x6e
+
+#----------------------------------------------------------------------
+# Vector Absolute Compare Mask Greater Than (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: facgt v31.2d, v29.2d, v28.2d
+0xbf 0xef 0xfc 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmeq v5.16b, v15.16b, v31.16b
+0xe5 0x8d 0x3f 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Higher or Same (Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: cmhs v1.8b, v16.8b, v30.8b
+0x01 0x3e 0x3e 0x2e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmge v20.4h, v11.4h, v23.4h
+0x74 0x3d 0x77 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Higher (Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: cmhi v13.8h, v3.8h, v27.8h
+0x6d 0x34 0x7b 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmgt v9.4s, v4.4s, v28.4s
+0x89 0x34 0xbc 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Bitwise Test (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmtst v21.2s, v19.2s, v18.2s
+0x75 0x8e 0xb2 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmeq v0.2s, v15.2s, v16.2s
+0xe0 0xe5 0x30 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Or Equal (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmge v31.4s, v7.4s, v29.4s
+0xff 0xe4 0x3d 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmgt v17.4s, v8.4s, v25.4s
+0x11 0xe5 0xb9 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal to Zero (Integer)
+#----------------------------------------------------------------------
+# CHECK: cmeq v31.16b, v15.16b, #0x0
+0xff 0x99 0x20 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmge v3.8b, v15.8b, #0x0
+0xe3 0x89 0x20 0x2e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmgt v22.2s, v9.2s, #0x0
+0x36 0x89 0xa0 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmle v5.2d, v14.2d, #0x0
+0xc5 0x99 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than Zero (Signed Integer)
+#----------------------------------------------------------------------
+# CHECK: cmlt v13.8h, v11.8h, #0x0
+0x6d 0xa9 0x60 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmeq v15.2s, v21.2s, #0.0
+0xaf 0xda 0xa0 0x0e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmge v14.2d, v13.2d, #0.0
+0xae 0xc9 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Greater Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmgt v9.4s, v23.4s, #0.0
+0xe9 0xca 0xa0 0x4e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than or Equal To Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmle v11.2d, v6.2d, #0.0
+0xcb 0xd8 0xe0 0x6e
+
+#----------------------------------------------------------------------
+# Vector Compare Mask Less Than Zero (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fcmlt v12.4s, v25.4s, #0.0
+0x2c 0xeb 0xa0 0x4e
+
+
+#------------------------------------------------------------------------------
+# Vector Integer Halving Add (Signed)
+# Vector Integer Halving Add (Unsigned)
+# Vector Integer Halving Sub (Signed)
+# Vector Integer Halving Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: shadd v0.8b, v31.8b, v29.8b
+# CHECK: uhadd v15.16b, v16.16b, v17.16b
+# CHECK: shsub v0.4h, v1.4h, v2.4h
+# CHECK: uhadd v5.8h, v7.8h, v8.8h
+# CHECK: shsub v9.2s, v11.2s, v21.2s
+# CHECK: uhsub v22.4s, v30.4s, v19.4s
+0xe0 0x07 0x3d 0x0e
+0x0f 0x06 0x31 0x6e
+0x20 0x24 0x62 0x0e
+0xe5 0x04 0x68 0x6e
+0x69 0x25 0xb5 0x0e
+0xd6 0x27 0xb3 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Rounding Halving Add (Signed)
+# Vector Integer Rounding Halving Add (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: srhadd v3.8b, v5.8b, v7.8b
+# CHECK: urhadd v7.16b, v17.16b, v27.16b
+# CHECK: srhadd v10.4h, v11.4h, v13.4h
+# CHECK: urhadd v1.8h, v2.8h, v3.8h
+# CHECK: srhadd v4.2s, v5.2s, v6.2s
+# CHECK: urhadd v7.4s, v7.4s, v7.4s
+0xa3 0x14 0x27 0x0e
+0x27 0x16 0x3b 0x6e
+0x6a 0x15 0x6d 0x0e
+0x41 0x14 0x63 0x6e
+0xa4 0x14 0xa6 0x0e
+0xe7 0x14 0xa7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Integer Saturating Add (Signed)
+# Vector Integer Saturating Add (Unsigned)
+# Vector Integer Saturating Sub (Signed)
+# Vector Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqsub v0.8b, v1.8b, v2.8b
+# CHECK: sqadd v0.16b, v1.16b, v2.16b
+# CHECK: uqsub v0.4h, v1.4h, v2.4h
+# CHECK: uqadd v0.8h, v1.8h, v2.8h
+# CHECK: sqadd v0.2s, v1.2s, v2.2s
+# CHECK: sqsub v0.4s, v1.4s, v2.4s
+# CHECK: sqsub v0.2d, v1.2d, v2.2d
+0x20 0x2c 0x22 0x0e
+0x20 0x0c 0x22 0x4e
+0x20 0x2c 0x62 0x2e
+0x20 0x0c 0x62 0x6e
+0x20 0x0c 0xa2 0x0e
+0x20 0x2c 0xa2 0x4e
+0x20 0x2c 0xe2 0x4e
+
+#------------------------------------------------------------------------------
+# Scalar Integer Saturating Add (Signed)
+# Scalar Integer Saturating Add (Unsigned)
+# Scalar Integer Saturating Sub (Signed)
+# Scalar Integer Saturating Sub (Unsigned)
+#------------------------------------------------------------------------------
+# CHECK: sqadd b20, b11, b15
+# CHECK: uqadd h0, h1, h5
+# CHECK: sqsub s20, s10, s7
+# CHECK: uqsub d16, d16, d16
+0x74 0x0d 0x2f 0x5e
+0x20 0x0c 0x65 0x7e
+0x54 0x2d 0xa7 0x5e
+0x10 0x2e 0xf0 0x7e
+
+
+#----------------------------------------------------------------------
+# Vector Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sshl v10.8b, v15.8b, v22.8b
+# CHECK: ushl v10.16b, v5.16b, v2.16b
+# CHECK: sshl v10.4h, v15.4h, v22.4h
+# CHECK: ushl v10.8h, v5.8h, v2.8h
+# CHECK: sshl v10.2s, v15.2s, v22.2s
+# CHECK: ushl v10.4s, v5.4s, v2.4s
+# CHECK: sshl v0.2d, v1.2d, v2.2d
+0xea 0x45 0x36 0x0e
+0xaa 0x44 0x22 0x6e
+0xea 0x45 0x76 0x0e
+0xaa 0x44 0x62 0x6e
+0xea 0x45 0xb6 0x0e
+0xaa 0x44 0xa2 0x6e
+0x20 0x44 0xe2 0x4e
+
+#----------------------------------------------------------------------
+# Vector Saturating Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqshl v1.8b, v15.8b, v22.8b
+# CHECK: uqshl v2.16b, v14.16b, v23.16b
+# CHECK: sqshl v3.4h, v13.4h, v24.4h
+# CHECK: uqshl v4.8h, v12.8h, v25.8h
+# CHECK: sqshl v5.2s, v11.2s, v26.2s
+# CHECK: uqshl v6.4s, v10.4s, v27.4s
+# CHECK: uqshl v0.2d, v1.2d, v2.2d
+0xe1 0x4d 0x36 0x0e
+0xc2 0x4d 0x37 0x6e
+0xa3 0x4d 0x78 0x0e
+0x84 0x4d 0x79 0x6e
+0x65 0x4d 0xba 0x0e
+0x46 0x4d 0xbb 0x6e
+0x20 0x4c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: srshl v10.8b, v5.8b, v22.8b
+# CHECK: urshl v10.16b, v5.16b, v2.16b
+# CHECK: srshl v1.4h, v5.4h, v31.4h
+# CHECK: urshl v1.8h, v5.8h, v2.8h
+# CHECK: srshl v10.2s, v15.2s, v2.2s
+# CHECK: urshl v1.4s, v5.4s, v2.4s
+# CHECK: urshl v0.2d, v1.2d, v2.2d
+0xaa 0x54 0x36 0x0e
+0xaa 0x54 0x22 0x6e
+0xa1 0x54 0x7f 0x0e
+0xa1 0x54 0x62 0x6e
+0xea 0x55 0xa2 0x0e
+0xa1 0x54 0xa2 0x6e
+0x20 0x54 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Vector Saturating Rounding Shift Left (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: sqrshl v1.8b, v15.8b, v22.8b
+# CHECK: uqrshl v2.16b, v14.16b, v23.16b
+# CHECK: sqrshl v3.4h, v13.4h, v24.4h
+# CHECK: uqrshl v4.8h, v12.8h, v25.8h
+# CHECK: sqrshl v5.2s, v11.2s, v26.2s
+# CHECK: uqrshl v6.4s, v10.4s, v27.4s
+# CHECK: uqrshl v0.2d, v1.2d, v2.2d
+0xe1 0x5d 0x36 0x0e
+0xc2 0x5d 0x37 0x6e
+0xa3 0x5d 0x78 0x0e
+0x84 0x5d 0x79 0x6e
+0x65 0x5d 0xba 0x0e
+0x46 0x5d 0xbb 0x6e
+0x20 0x5c 0xe2 0x6e
+
+#----------------------------------------------------------------------
+# Scalar Integer Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sshl d31, d31, d31
+# CHECK: ushl d0, d0, d0
+0xff 0x47 0xff 0x5e
+0x00 0x44 0xe0 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqshl d31, d31, d31
+# CHECK: uqshl s23, s20, s16
+# CHECK: sqshl h3, h4, h15
+# CHECK: uqshl b11, b20, b30
+0xff 0x4f 0xff 0x5e
+0x97 0x4e 0xb0 0x7e
+0x83 0x4c 0x6f 0x5e
+0x8b 0x4e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: srshl d16, d16, d16
+# CHECK: urshl d8, d7, d4
+0x10 0x56 0xf0 0x5e
+0xe8 0x54 0xe4 0x7e
+
+#----------------------------------------------------------------------
+# Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
+#----------------------------------------------------------------------
+# CHECK: sqrshl d31, d31, d31
+# CHECK: uqrshl s23, s20, s16
+# CHECK: sqrshl h3, h4, h15
+# CHECK: uqrshl b11, b20, b30
+0xff 0x5f 0xff 0x5e
+0x97 0x5e 0xb0 0x7e
+0x83 0x5c 0x6f 0x5e
+0x8b 0x5e 0x3e 0x7e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smax v1.8b, v15.8b, v22.8b
+# CHECK: umax v2.16b, v14.16b, v23.16b
+# CHECK: smax v3.4h, v13.4h, v24.4h
+# CHECK: umax v4.8h, v12.8h, v25.8h
+# CHECK: smax v5.2s, v11.2s, v26.2s
+# CHECK: umax v6.4s, v10.4s, v27.4s
+0xe1 0x65 0x36 0x0e
+0xc2 0x65 0x37 0x6e
+0xa3 0x65 0x78 0x0e
+0x84 0x65 0x79 0x6e
+0x65 0x65 0xba 0x0e
+0x46 0x65 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: umin v1.8b, v15.8b, v22.8b
+# CHECK: smin v2.16b, v14.16b, v23.16b
+# CHECK: umin v3.4h, v13.4h, v24.4h
+# CHECK: smin v4.8h, v12.8h, v25.8h
+# CHECK: umin v5.2s, v11.2s, v26.2s
+# CHECK: smin v6.4s, v10.4s, v27.4s
+0xe1 0x6d 0x36 0x2e
+0xc2 0x6d 0x37 0x4e
+0xa3 0x6d 0x78 0x2e
+0x84 0x6d 0x79 0x4e
+0x65 0x6d 0xba 0x2e
+0x46 0x6d 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmax v29.2s, v28.2s, v25.2s
+# CHECK: fmax v9.4s, v8.4s, v5.4s
+# CHECK: fmax v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x0e
+0x09 0xf5 0x25 0x4e
+0x4b 0xf5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector Minimum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmin v29.2s, v28.2s, v25.2s
+# CHECK: fmin v9.4s, v8.4s, v5.4s
+# CHECK: fmin v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x0e
+0x09 0xf5 0xa5 0x4e
+0x4b 0xf5 0xe7 0x4e
+
+#----------------------------------------------------------------------
+# Vector maxNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnm v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnm v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnm v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x0e
+0x09 0xc5 0x25 0x4e
+0x4b 0xc5 0x67 0x4e
+
+#----------------------------------------------------------------------
+# Vector minNum (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnm v2.2s, v8.2s, v25.2s
+# CHECK: fminnm v9.4s, v8.4s, v5.4s
+# CHECK: fminnm v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x0e
+0x09 0xc5 0xa5 0x4e
+0x4b 0xc5 0xe7 0x4e
+
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: smaxp v1.8b, v15.8b, v22.8b
+# CHECK: umaxp v2.16b, v14.16b, v23.16b
+# CHECK: smaxp v3.4h, v13.4h, v24.4h
+# CHECK: umaxp v4.8h, v12.8h, v25.8h
+# CHECK: smaxp v5.2s, v11.2s, v26.2s
+# CHECK: umaxp v6.4s, v10.4s, v27.4s
+0xe1 0xa5 0x36 0x0e
+0xc2 0xa5 0x37 0x6e
+0xa3 0xa5 0x78 0x0e
+0x84 0xa5 0x79 0x6e
+0x65 0xa5 0xba 0x0e
+0x46 0xa5 0xbb 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Signed and Unsigned Integer)
+#----------------------------------------------------------------------
+# CHECK: uminp v1.8b, v15.8b, v22.8b
+# CHECK: sminp v2.16b, v14.16b, v23.16b
+# CHECK: uminp v3.4h, v13.4h, v24.4h
+# CHECK: sminp v4.8h, v12.8h, v25.8h
+# CHECK: uminp v5.2s, v11.2s, v26.2s
+# CHECK: sminp v6.4s, v10.4s, v27.4s
+0xe1 0xad 0x36 0x2e
+0xc2 0xad 0x37 0x4e
+0xa3 0xad 0x78 0x2e
+0x84 0xad 0x79 0x4e
+0x65 0xad 0xba 0x2e
+0x46 0xad 0xbb 0x4e
+
+#----------------------------------------------------------------------
+# Vector Maximum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxp v29.2s, v28.2s, v25.2s
+# CHECK: fmaxp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0x39 0x2e
+0x09 0xf5 0x25 0x6e
+0x4b 0xf5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector Minimum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminp v29.2s, v28.2s, v25.2s
+# CHECK: fminp v9.4s, v8.4s, v5.4s
+# CHECK: fminp v11.2d, v10.2d, v7.2d
+0x9d 0xf7 0xb9 0x2e
+0x09 0xf5 0xa5 0x6e
+0x4b 0xf5 0xe7 0x6e
+
+#----------------------------------------------------------------------
+# Vector maxNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fmaxnmp v9.2s, v8.2s, v5.2s
+# CHECK: fmaxnmp v9.4s, v8.4s, v5.4s
+# CHECK: fmaxnmp v11.2d, v10.2d, v7.2d
+0x09 0xc5 0x25 0x2e
+0x09 0xc5 0x25 0x6e
+0x4b 0xc5 0x67 0x6e
+
+#----------------------------------------------------------------------
+# Vector minNum Pairwise (Floating Point)
+#----------------------------------------------------------------------
+# CHECK: fminnmp v2.2s, v8.2s, v25.2s
+# CHECK: fminnmp v9.4s, v8.4s, v5.4s
+# CHECK: fminnmp v11.2d, v10.2d, v7.2d
+0x02 0xc5 0xb9 0x2e
+0x09 0xc5 0xa5 0x6e
+0x4b 0xc5 0xe7 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Integer)
+#------------------------------------------------------------------------------
+# CHECK: addp v31.8b, v31.8b, v31.8b
+# CHECK: addp v0.2d, v0.2d, v0.2d
+0xff 0xbf 0x3f 0x0e
+0x00 0xbc 0xe0 0x4e
+
+#------------------------------------------------------------------------------
+# Vector Add Pairwise (Floating Point)
+#------------------------------------------------------------------------------
+# CHECK: faddp v0.4s, v0.4s, v0.4s
+# CHECK: faddp v31.2s, v31.2s, v31.2s
+0x00 0xd4 0x20 0x6e
+0xff 0xd7 0x3f 0x2e
+
+
+#------------------------------------------------------------------------------
+# Vector Saturating Doubling Multiply High
+# Vector Saturating Rounding Doubling Multiply High
+#------------------------------------------------------------------------------
+# CHECK: sqdmulh v31.2s, v31.2s, v31.2s
+# CHECK: sqdmulh v5.4s, v7.4s, v9.4s
+# CHECK: sqrdmulh v31.4h, v3.4h, v13.4h
+# CHECK: sqrdmulh v0.8h, v10.8h, v20.8h
+0xff 0xb7 0xbf 0x0e
+0xe5 0xb4 0xa9 0x4e
+0x7f 0xb4 0x6d 0x2e
+0x40 0xb5 0x74 0x6e
+
+#------------------------------------------------------------------------------
+# Vector Multiply Extended
+#------------------------------------------------------------------------------
+# CHECK: fmulx v1.2s, v22.2s, v2.2s
+# CHECK: fmulx v21.4s, v15.4s, v3.4s
+# CHECK: fmulx v11.2d, v5.2d, v23.2d
+0xc1 0xde 0x22 0x0e
+0xf5 0xdd 0x23 0x4e
+0xab 0xdc 0x77 0x4e
+