From 79c6a4f3478bd21558e2c779667bec7d69e94ccc Mon Sep 17 00:00:00 2001
From: Kevin Qin <Kevin.Qin@arm.com>
Date: Wed, 29 Jan 2014 01:57:30 +0000
Subject: [AArch64 NEON] Lower SELECT_CC with vector operand.

When the scalar compare is between floating-point values and the selected
operands are vectors, we custom lower SELECT_CC to use a NEON SIMD compare,
which generates fewer instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@200365 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64ISelLowering.cpp | 234 ++++++++++++++++++++++-------
 lib/Target/AArch64/AArch64InstrNEON.td     |   4 +
 test/CodeGen/AArch64/neon-select_cc.ll     | 180 ++++++++++++++++++++++
 3 files changed, 362 insertions(+), 56 deletions(-)
 create mode 100644 test/CodeGen/AArch64/neon-select_cc.ll
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 327547c27c..7f101fffa1 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -458,6 +458,32 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
     setOperationAction(ISD::FREM, MVT::v1f64, Expand);
     setOperationAction(ISD::FREM, MVT::v2f64, Expand);
 
+    setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
+    setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
+    setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
+    setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
+    setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+    setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
+
+    setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
+    setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
+
     // Vector ExtLoad and TruncStore are expanded.
     for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
          I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
@@ -2661,62 +2687,6 @@ AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
   }
 }
 
-// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
-SDValue
-AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue IfTrue = Op.getOperand(2);
-  SDValue IfFalse = Op.getOperand(3);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
-  if (LHS.getValueType() == MVT::f128) {
-    // f128 comparisons are lowered to libcalls, but slot in nicely here
-    // afterwards.
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
-    // If softenSetCCOperands returned a scalar, we need to compare the result
-    // against zero to select between true and false values.
-    if (RHS.getNode() == 0) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
-      CC = ISD::SETNE;
-    }
-  }
-
-  if (LHS.getValueType().isInteger()) {
-    SDValue A64cc;
-
-    // Integers are handled in a separate function because the combinations of
-    // immediates and tests can get hairy and we may want to fiddle things.
-    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
-
-    return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
-                       CmpOp, IfTrue, IfFalse, A64cc);
-  }
-
-  // Note that some LLVM floating-point CondCodes can't be lowered to a single
-  // conditional branch, hence FPCCToA64CC can set a second test, where either
-  // passing is sufficient.
-  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
-  CondCode = FPCCToA64CC(CC, Alternative);
-  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
-  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                              DAG.getCondCode(CC));
-  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
-                                     Op.getValueType(),
-                                     SetCC, IfTrue, IfFalse, A64cc);
-
-  if (Alternative != A64CC::Invalid) {
-    A64cc = DAG.getConstant(Alternative, MVT::i32);
-    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
-                               SetCC, IfTrue, A64SELECT_CC, A64cc);
-
-  }
-
-  return A64SELECT_CC;
-}
-
 // (SELECT testbit, iftrue, iffalse)
 SDValue
 AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -3004,6 +2974,158 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   return A64SELECT_CC;
 }
 
+// Lower (SELECT_CC lhs, rhs, iftrue, iffalse, condcode) whose iftrue/iffalse
+// operands are vectors while the compared operands are scalars. The scalar
+// compare is turned into a vector mask of the same width as the selected
+// vectors, which then drives a VSELECT between IfTrue and IfFalse.
+static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  SDLoc dl(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue IfTrue = Op.getOperand(2);
+  SDValue IfFalse = Op.getOperand(3);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  EVT IfTrueVT = IfTrue.getValueType();
+
+  // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
+  // use NEON compare.
+  if (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) {
+    EVT EltVT = LHS.getValueType();
+    unsigned EltNum = 128 / EltVT.getSizeInBits();
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
+    unsigned SubConstant =
+        (EltVT == MVT::f32) ? AArch64::sub_32 : AArch64::sub_64;
+    EVT CEltT = (EltVT == MVT::f32) ? MVT::i32 : MVT::i64;
+    EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
+
+    // Wrap each scalar into a 128-bit vector value via SUBREG_TO_REG so that
+    // the compare can go through the vector SETCC lowering below.
+    SDValue Zero = DAG.getTargetConstant(0, MVT::i32);
+    SDValue SubIdx = DAG.getTargetConstant(SubConstant, MVT::i32);
+    LHS = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, VT, Zero,
+                                     LHS, SubIdx), 0);
+    RHS = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl, VT, Zero,
+                                     RHS, SubIdx), 0);
+
+    SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
+    SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
+    EVT CastEltT =
+        MVT::getIntegerVT(IfTrueVT.getVectorElementType().getSizeInBits());
+    EVT CastVT = EVT::getVectorVT(*DAG.getContext(), CastEltT,
+                                  IfTrueVT.getVectorNumElements());
+    if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
+      // The selected vectors are wider than one compare lane: broadcast lane 0
+      // of the compare result across a vector of the same total width.
+      EVT DUPVT =
+          EVT::getVectorVT(*DAG.getContext(), CEltT,
+                           IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
+      ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
+                          DAG.getConstant(0, MVT::i64, false));
+    } else {
+      // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
+      // can't handle them and will hit this assert.
+      assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
+             "Vector of IfTrue & IfFalse is too small.");
+      unsigned ExEltNum =
+          EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
+      EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
+      ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
+                          DAG.getConstant(0, MVT::i64, false));
+    }
+    // Give the mask integer lanes laid out like IfTrue's elements.
+    ResCC = DAG.getNode(ISD::BITCAST, dl, CastVT, ResCC);
+    return DAG.getNode(ISD::VSELECT, dl, IfTrueVT, ResCC, IfTrue, IfFalse);
+  }
+
+  // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
+  // vectors.
+  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
+  CondCode = FPCCToA64CC(CC, Alternative);
+  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
+  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
+                              DAG.getCondCode(CC));
+  EVT SEVT = MVT::i32;
+  if (IfTrueVT.getVectorElementType().getSizeInBits() > 32)
+    SEVT = MVT::i64;
+  SDValue AllOne = DAG.getConstant(-1, SEVT);
+  SDValue AllZero = DAG.getConstant(0, SEVT);
+  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
+                                     AllOne, AllZero, A64cc);
+
+  if (Alternative != A64CC::Invalid) {
+    // The chained select still produces the scalar mask, so it must have type
+    // SEVT (not the vector result type of Op), like the first select above.
+    A64cc = DAG.getConstant(Alternative, MVT::i32);
+    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC, AllOne,
+                               A64SELECT_CC, A64cc);
+  }
+  SDValue VDup;
+  if (IfTrueVT.getVectorNumElements() == 1)
+    VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, IfTrueVT, A64SELECT_CC);
+  else
+    VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, IfTrueVT, A64SELECT_CC);
+  return DAG.getNode(ISD::VSELECT, dl, IfTrueVT, VDup, IfTrue, IfFalse);
+}
+
+// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
+// Vector results are handed to LowerVectorSELECT_CC; scalar results are
+// lowered to AArch64ISD::SELECT_CC nodes.
+SDValue
+AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue CmpLHS = Op.getOperand(0);
+  SDValue CmpRHS = Op.getOperand(1);
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  EVT ResVT = Op.getValueType();
+
+  // NOTE(review): this dispatch happens before the f128 softening below, so
+  // f128 compares that select vectors are not softened here -- confirm that
+  // LowerVectorSELECT_CC copes with them.
+  if (TrueVal.getValueType().isVector())
+    return LowerVectorSELECT_CC(Op, DAG);
+
+  if (CmpLHS.getValueType() == MVT::f128) {
+    // An f128 compare becomes a libcall; its result is then selected on like
+    // any other compare.
+    softenSetCCOperands(DAG, MVT::f128, CmpLHS, CmpRHS, Cond, DL);
+
+    // When softening leaves only a single scalar (no RHS), test that scalar
+    // against zero to choose between the true and false values.
+    if (CmpRHS.getNode() == 0) {
+      CmpRHS = DAG.getConstant(0, CmpLHS.getValueType());
+      Cond = ISD::SETNE;
+    }
+  }
+
+  if (CmpLHS.getValueType().isInteger()) {
+    // Integer compares have their own helper: picking immediates and tests
+    // gets intricate and may need tweaking.
+    SDValue IntCC;
+    SDValue Cmp = getSelectableIntSetCC(CmpLHS, CmpRHS, Cond, IntCC, DAG, DL);
+    return DAG.getNode(AArch64ISD::SELECT_CC, DL, ResVT, Cmp, TrueVal,
+                       FalseVal, IntCC);
+  }
+
+  // Some floating-point CondCodes cannot be expressed as one A64 condition;
+  // FPCCToA64CC then reports a second condition, and the select is true when
+  // either of the two holds.
+  A64CC::CondCodes Second = A64CC::Invalid;
+  A64CC::CondCodes First = FPCCToA64CC(Cond, Second);
+  SDValue FirstCC = DAG.getConstant(First, MVT::i32);
+  SDValue FPCmp = DAG.getNode(AArch64ISD::SETCC, DL, MVT::i32, CmpLHS, CmpRHS,
+                              DAG.getCondCode(Cond));
+  SDValue Result = DAG.getNode(AArch64ISD::SELECT_CC, DL, ResVT, FPCmp,
+                               TrueVal, FalseVal, FirstCC);
+  if (Second == A64CC::Invalid)
+    return Result;
+
+  // Chain a second select so TrueVal is also chosen when Second holds.
+  return DAG.getNode(AArch64ISD::SELECT_CC, DL, ResVT, FPCmp, TrueVal, Result,
+                     DAG.getConstant(Second, MVT::i32));
+}
+
 SDValue
 AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
   const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 3056343abb..1309bf12b5 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -461,10 +461,14 @@ multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
             (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
   def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
             (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
   def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
             (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
   def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
             (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
+  def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
+            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
   def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
             (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
   def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
diff --git a/test/CodeGen/AArch64/neon-select_cc.ll b/test/CodeGen/AArch64/neon-select_cc.ll
new file mode 100644
index 0000000000..537ec97d36
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-select_cc.ll
@@ -0,0 +1,180 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_i8:
+; CHECK: and	w0, w0, #0xff
+; CHECK-NEXT: cmp	w0, w1, uxtb
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.8b, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v0.8b, v1.8b
+  %cmp31 = icmp eq i8 %a, %b
+  %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+  ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f32(float %a, float %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f32:
+; CHECK: fcmeq	v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup	v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v2.8b, v3.8b
+  %cmp31 = fcmp oeq float %a, %b
+  %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+  ret <8x i8> %e
+}
+
+define <8x i8> @test_select_cc_v8i8_f64(double %a, double %b, <8x i8> %c, <8x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v8i8_f64:
+; CHECK: fcmeq	v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v2.8b, v3.8b
+  %cmp31 = fcmp oeq double %a, %b
+  %e = select i1 %cmp31, <8x i8> %c, <8x i8> %d
+  ret <8x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_i8(i8 %a, i8 %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_i8:
+; CHECK: and	w0, w0, #0xff
+; CHECK-NEXT: cmp	w0, w1, uxtb
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.16b, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v0.16b, v1.16b
+  %cmp31 = icmp eq i8 %a, %b
+  %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+  ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f32(float %a, float %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f32:
+; CHECK: fcmeq	v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup	v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v2.16b, v3.16b
+  %cmp31 = fcmp oeq float %a, %b
+  %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+  ret <16x i8> %e
+}
+
+define <16x i8> @test_select_cc_v16i8_f64(double %a, double %b, <16x i8> %c, <16x i8> %d ) {
+; CHECK-LABEL: test_select_cc_v16i8_f64:
+; CHECK: fcmeq	v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup	v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v2.16b, v3.16b
+  %cmp31 = fcmp oeq double %a, %b
+  %e = select i1 %cmp31, <16x i8> %c, <16x i8> %d
+  ret <16x i8> %e
+}
+
+define <4x i16> @test_select_cc_v4i16(i16 %a, i16 %b, <4x i16> %c, <4x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v4i16:
+; CHECK: and	w0, w0, #0xffff
+; CHECK-NEXT: cmp	w0, w1, uxth
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.4h, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v0.8b, v1.8b
+  %cmp31 = icmp eq i16 %a, %b
+  %e = select i1 %cmp31, <4x i16> %c, <4x i16> %d
+  ret <4x i16> %e
+}
+
+define <8x i16> @test_select_cc_v8i16(i16 %a, i16 %b, <8x i16> %c, <8x i16> %d ) {
+; CHECK-LABEL: test_select_cc_v8i16:
+; CHECK: and	w0, w0, #0xffff
+; CHECK-NEXT: cmp	w0, w1, uxth
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.8h, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v0.16b, v1.16b
+  %cmp31 = icmp eq i16 %a, %b
+  %e = select i1 %cmp31, <8x i16> %c, <8x i16> %d
+  ret <8x i16> %e
+}
+
+define <2x i32> @test_select_cc_v2i32(i32 %a, i32 %b, <2x i32> %c, <2x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v2i32:
+; CHECK: cmp	w0, w1, uxtw
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.2s, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v0.8b, v1.8b
+  %cmp31 = icmp eq i32 %a, %b
+  %e = select i1 %cmp31, <2x i32> %c, <2x i32> %d
+  ret <2x i32> %e
+}
+
+define <4x i32> @test_select_cc_v4i32(i32 %a, i32 %b, <4x i32> %c, <4x i32> %d ) {
+; CHECK-LABEL: test_select_cc_v4i32:
+; CHECK: cmp	w0, w1, uxtw
+; CHECK-NEXT: csinv	w0, wzr, wzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.4s, w0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v0.16b, v1.16b
+  %cmp31 = icmp eq i32 %a, %b
+  %e = select i1 %cmp31, <4x i32> %c, <4x i32> %d
+  ret <4x i32> %e
+}
+
+define <1x i64> @test_select_cc_v1i64(i64 %a, i64 %b, <1x i64> %c, <1x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v1i64:
+; CHECK: cmp	x0, x1
+; CHECK-NEXT: csinv	x0, xzr, xzr, ne
+; CHECK-NEXT: fmov	d{{[0-9]+}}, x0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v0.8b, v1.8b
+  %cmp31 = icmp eq i64 %a, %b
+  %e = select i1 %cmp31, <1x i64> %c, <1x i64> %d
+  ret <1x i64> %e
+}
+
+define <2x i64> @test_select_cc_v2i64(i64 %a, i64 %b, <2x i64> %c, <2x i64> %d ) {
+; CHECK-LABEL: test_select_cc_v2i64:
+; CHECK: cmp	x0, x1
+; CHECK-NEXT: csinv	x0, xzr, xzr, ne
+; CHECK-NEXT: dup	v{{[0-9]+}}.2d, x0
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v0.16b, v1.16b
+  %cmp31 = icmp eq i64 %a, %b
+  %e = select i1 %cmp31, <2x i64> %c, <2x i64> %d
+  ret <2x i64> %e
+}
+
+define <1 x float> @test_select_cc_v1f32(float %a, float %b, <1 x float> %c, <1 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v1f32:
+; CHECK: fcmp	s0, s1
+; CHECK-NEXT: fcsel	s0, s2, s3, eq
+  %cmp31 = fcmp oeq float %a, %b
+  %e = select i1 %cmp31, <1 x float> %c, <1 x float> %d
+  ret <1 x float> %e
+}
+  
+define <2 x float> @test_select_cc_v2f32(float %a, float %b, <2 x float> %c, <2 x float> %d ) {
+; CHECK-LABEL: test_select_cc_v2f32:
+; CHECK: fcmeq	v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup	v{{[0-9]+}}.2s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v2.8b, v3.8b
+  %cmp31 = fcmp oeq float %a, %b
+  %e = select i1 %cmp31, <2 x float> %c, <2 x float> %d
+  ret <2 x float> %e
+}
+
+define <4x float> @test_select_cc_v4f32(float %a, float %b, <4x float> %c, <4x float> %d ) {
+; CHECK-LABEL: test_select_cc_v4f32:
+; CHECK: fcmeq	v{{[0-9]+}}.4s, v0.4s, v1.4s
+; CHECK-NEXT: dup	v{{[0-9]+}}.4s, v{{[0-9]+}}.s[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v2.16b, v3.16b
+  %cmp31 = fcmp oeq float %a, %b
+  %e = select i1 %cmp31, <4x float> %c, <4x float> %d
+  ret <4x float> %e
+}
+
+define <1 x double> @test_select_cc_v1f64(double %a, double %b, <1 x double> %c, <1 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v1f64:
+; CHECK: fcmeq	v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.8b, v2.8b, v3.8b
+  %cmp31 = fcmp oeq double %a, %b
+  %e = select i1 %cmp31, <1 x double> %c, <1 x double> %d
+  ret <1 x double> %e
+}
+
+define <2 x double> @test_select_cc_v2f64(double %a, double %b, <2 x double> %c, <2 x double> %d ) {
+; CHECK-LABEL: test_select_cc_v2f64:
+; CHECK: fcmeq	v{{[0-9]+}}.2d, v0.2d, v1.2d
+; CHECK-NEXT: dup	v{{[0-9]+}}.2d, v{{[0-9]+}}.d[0]
+; CHECK-NEXT:	bsl	v{{[0-9]+}}.16b, v2.16b, v3.16b
+  %cmp31 = fcmp oeq double %a, %b
+  %e = select i1 %cmp31, <2 x double> %c, <2 x double> %d
+  ret <2 x double> %e
+}
-- 
cgit v1.2.3