author    Kevin Qin <Kevin.Qin@arm.com>  2014-01-29 01:57:30 +0000
committer Kevin Qin <Kevin.Qin@arm.com>  2014-01-29 01:57:30 +0000
commit    79c6a4f3478bd21558e2c779667bec7d69e94ccc (patch)
tree      4c69979185e3d7da5d820000115dbeb39c1a1944
parent    e0f5a8667105ac03c6cd21c0200ba87e94d8d931 (diff)
[AArch64 NEON] Lower SELECT_CC with vector operand.
When the compare is between scalar floating-point values and the select operands are vectors, we custom lower SELECT_CC to a NEON SIMD compare, which generates fewer instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200365 91177308-0d34-0410-b5e6-96231b3b80d8
 lib/Target/AArch64/AArch64ISelLowering.cpp | 234
 lib/Target/AArch64/AArch64InstrNEON.td     |   4
 test/CodeGen/AArch64/neon-select_cc.ll     | 180
 3 files changed, 362 insertions(+), 56 deletions(-)
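As a minimal sketch of the pattern this patch targets (the function name @demo is hypothetical; the expected instructions are paraphrased from the CHECK lines of test_select_cc_v2f32 in the new test below, and exact register assignment is not guaranteed): a select whose condition is a scalar floating-point compare but whose results are vectors is now lowered to a NEON compare (fcmeq), a lane dup and a bit-select (bsl).

; Scalar f32 compare selecting between v2f32 values (compiled with -mattr=+neon).
define <2 x float> @demo(float %a, float %b, <2 x float> %c, <2 x float> %d) {
  %cmp = fcmp oeq float %a, %b                           ; scalar floating-point compare ...
  %sel = select i1 %cmp, <2 x float> %c, <2 x float> %d  ; ... selecting between vector operands
  ret <2 x float> %sel
}
; Expected lowering, roughly (registers illustrative):
;   fcmeq v0.4s, v0.4s, v1.4s
;   dup   v0.2s, v0.s[0]
;   bsl   v0.8b, v2.8b, v3.8b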
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 327547c27c..7f101fffa1 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -458,6 +458,32 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
setOperationAction(ISD::FREM, MVT::v1f64, Expand);
setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+ setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+ setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
+
+ setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
+ setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
+
// Vector ExtLoad and TruncStore are expanded.
for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
@@ -2661,62 +2687,6 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
}
}
-// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
-SDValue
-AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
- SDLoc dl(Op);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- SDValue IfTrue = Op.getOperand(2);
- SDValue IfFalse = Op.getOperand(3);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
- if (LHS.getValueType() == MVT::f128) {
- // f128 comparisons are lowered to libcalls, but slot in nicely here
- // afterwards.
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
- // If softenSetCCOperands returned a scalar, we need to compare the result
- // against zero to select between true and false values.
- if (RHS.getNode() == 0) {
- RHS = DAG.getConstant(0, LHS.getValueType());
- CC = ISD::SETNE;
- }
- }
-
- if (LHS.getValueType().isInteger()) {
- SDValue A64cc;
-
- // Integers are handled in a separate function because the combinations of
- // immediates and tests can get hairy and we may want to fiddle things.
- SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
-
- return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- CmpOp, IfTrue, IfFalse, A64cc);
- }
-
- // Note that some LLVM floating-point CondCodes can't be lowered to a single
- // conditional branch, hence FPCCToA64CC can set a second test, where either
- // passing is sufficient.
- A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
- CondCode = FPCCToA64CC(CC, Alternative);
- SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
- SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
- DAG.getCondCode(CC));
- SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
- Op.getValueType(),
- SetCC, IfTrue, IfFalse, A64cc);
-
- if (Alternative != A64CC::Invalid) {
- A64cc = DAG.getConstant(Alternative, MVT::i32);
- A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
- SetCC, IfTrue, A64SELECT_CC, A64cc);
-
- }
-
- return A64SELECT_CC;
-}
-
// (SELECT testbit, iftrue, iffalse)
SDValue
AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -3004,6 +2974,158 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return A64SELECT_CC;
}
+static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue IfTrue = Op.getOperand(2);
+ SDValue IfFalse = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
+ // use NEON compare.
+ if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) {
+ EVT EltVT = LHS.getValueType();
+ unsigned EltNum = 128 / EltVT.getSizeInBits();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
+ unsigned SubConstant =
+ (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 : AArch64::sub_64;
+ EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
+
+ LHS
+ = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+ VT, DAG.getTargetConstant(0, MVT::i32), LHS,
+ DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
+ RHS
+ = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
+ VT, DAG.getTargetConstant(0, MVT::i32), RHS,
+ DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
+
+ SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
+ SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
+ EVT IfTrueVT = IfTrue.getValueType();
+ EVT CastEltT =
+ MVT::getIntegerVT(IfTrueVT.getVectorElementType().getSizeInBits());
+ EVT CastVT = EVT::getVectorVT(*DAG.getContext(), CastEltT,
+ IfTrueVT.getVectorNumElements());
+ if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
+ EVT DUPVT =
+ EVT::getVectorVT(*DAG.getContext(), CEltT,
+ IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
+ ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
+ DAG.getConstant(0, MVT::i64, false));
+
+ ResCC = DAG.getNode(ISD::BITCAST, dl, CastVT, ResCC);
+ } else {
+ // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
+ // can't handle them and will hit this assert.
+ assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
+ "Vector of IfTrue & IfFalse is too small.");
+
+ unsigned ExEltNum =
+ EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
+ EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
+ ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
+ DAG.getConstant(0, MVT::i64, false));
+ ResCC = DAG.getNode(ISD::BITCAST, dl, CastVT, ResCC);
+ }
+ SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
+ ResCC, IfTrue, IfFalse);
+ return VSelect;
+ }
+
+ // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
+ // vectors.
+ A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+ CondCode = FPCCToA64CC(CC, Alternative);
+ SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+ SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+ DAG.getCondCode(CC));
+ EVT SEVT = MVT::i32;
+ if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32)
+ SEVT = MVT::i64;
+ SDValue AllOne = DAG.getConstant(-1, SEVT);
+ SDValue AllZero = DAG.getConstant(0, SEVT);
+ SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
+ AllOne, AllZero, A64cc);
+
+ if (Alternative != A64CC::Invalid) {
+ A64cc = DAG.getConstant(Alternative, MVT::i32);
+ A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+ SetCC, AllOne, A64SELECT_CC, A64cc);
+ }
+ SDValue VDup;
+ if (IfTrue.getValueType().getVectorNumElements() == 1)
+ VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, IfTrue.getValueType(),
+ A64SELECT_CC);
+ else
+ VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, IfTrue.getValueType(),
+ A64SELECT_CC);
+ SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
+ VDup, IfTrue, IfFalse);
+ return VSelect;
+}
+
+// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
+SDValue
+AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ SDValue IfTrue = Op.getOperand(2);
+ SDValue IfFalse = Op.getOperand(3);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+ if (IfTrue.getValueType().isVector())
+ return LowerVectorSELECT_CC(Op, DAG);
+
+ if (LHS.getValueType() == MVT::f128) {
+ // f128 comparisons are lowered to libcalls, but slot in nicely here
+ // afterwards.
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (RHS.getNode() == 0) {
+ RHS = DAG.getConstant(0, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ }
+
+ if (LHS.getValueType().isInteger()) {
+ SDValue A64cc;
+
+ // Integers are handled in a separate function because the combinations of
+ // immediates and tests can get hairy and we may want to fiddle things.
+ SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+
+ return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp,
+ IfTrue, IfFalse, A64cc);
+ }
+
+ // Note that some LLVM floating-point CondCodes can't be lowered to a single
+ // conditional branch, hence FPCCToA64CC can set a second test, where either
+ // passing is sufficient.
+ A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+ CondCode = FPCCToA64CC(CC, Alternative);
+ SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+ SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+ DAG.getCondCode(CC));
+ SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
+ Op.getValueType(),
+ SetCC, IfTrue, IfFalse, A64cc);
+
+ if (Alternative != A64CC::Invalid) {
+ A64cc = DAG.getConstant(Alternative, MVT::i32);
+ A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
+ SetCC, IfTrue, A64SELECT_CC, A64cc);
+
+ }
+
+ return A64SELECT_CC;
+}
+
SDValue
AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 3056343abb..1309bf12b5 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -461,10 +461,14 @@ multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
(INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+ def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+ (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
(INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll
new file mode 100644
index 0000000000..537ec97d36
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-select_cc.ll
@@ -0,0 +1,180 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_i8:
+; CHECK: and w0, w0, #0xff
+; CHECK-NEXT: cmp w0, w1, uxtb
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.8b, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i8 %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+ ret <8x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_i8:
+; CHECK: and w0, w0, #0xff
+; CHECK-NEXT: cmp w0, w1, uxtb
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.16b, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i8 %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+ ret <16x i8> %e
+}
+
+define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v4i16:
+; CHECK: and w0, w0, #0xffff
+; CHECK-NEXT: cmp w0, w1, uxth
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.4h, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i16 %a, %b
+ %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
+ ret <4x i16> %e
+}
+
+define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v8i16:
+; CHECK: and w0, w0, #0xffff
+; CHECK-NEXT: cmp w0, w1, uxth
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.8h, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i16 %a, %b
+ %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
+ ret <8x i16> %e
+}
+
+define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v2i32:
+; CHECK: cmp w0, w1, uxtw
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i32 %a, %b
+ %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
+ ret <2x i32> %e
+}
+
+define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v4i32:
+; CHECK: cmp w0, w1, uxtw
+; CHECK-NEXT: csinv w0, wzr, wzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, w0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i32 %a, %b
+ %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
+ ret <4x i32> %e
+}
+
+define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v1i64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinv x0, xzr, xzr, ne
+; CHECK-NEXT: fmov d{{[0-9]+}}, x0
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v0.8b, v1.8b
+ %cmp31 = icmp eq i64 %a, %b
+ %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
+ ret <1x i64> %e
+}
+
+define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v2i64:
+; CHECK: cmp x0, x1
+; CHECK-NEXT: csinv x0, xzr, xzr, ne
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, x0
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v0.16b, v1.16b
+ %cmp31 = icmp eq i64 %a, %b
+ %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
+ ret <2x i64> %e
+}
+
+define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v1f32:
+; CHECK: fcmp s0, s1
+; CHECK-NEXT: fcsel s0, s2, s3, eq
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
+ ret <1 x float> %e
+}
+
+define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v2f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <2 x float> %c, <2 x float> %d
+ ret <2 x float> %e
+}
+
+define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) {
+; CHECK-LABEL: test_select_cc_v4f32:
+; CHECK: fcmeq v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq float %a, %b
+ %e = select i1 %cmp31, <4x float> %c, <4x float> %d
+ ret <4x float> %e
+}
+
+define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v1f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: bsl v{{[0-9]+}}.8b, v2.8b, v3.8b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
+ ret <1 x double> %e
+}
+
+define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v2f64:
+; CHECK: fcmeq v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT: bsl v{{[0-9]+}}.16b, v2.16b, v3.16b
+ %cmp31 = fcmp oeq double %a, %b
+ %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
+ ret <2 x double> %e
+}