Optimize vector select from all 0s or all 1s

As packed comparisons in AVX/SSE produce all 0s or all 1s in each SIMD lane, vector select could be simplified to AND/OR or removed if one or both values being selected is all 0s or all 1s. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179267 91177308-0d34-0410-b5e6-96231b3b80d8
author: Michael Liao <michael.liao@intel.com> 2013-04-11 05:15:54 +0000
committer: Michael Liao <michael.liao@intel.com> 2013-04-11 05:15:54 +0000
commit: bf53841cfe3c341ebc0fca102d641c2018855254 (patch)
tree: 0288e1038ed3afd3eadc5ecf44b98d8dec9e63a3
parent: 02d2e612521954b5ff7c1ba6fd53e36bc51e1c48 (diff)
download: llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.gz
llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.bz2
llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.xz
2 files changed, 117 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 806e72f227..3564ce3920 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15787,6 +15787,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
     if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
       return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
 
+  // Simplify vector selection if the selector will be produced by CMPP*/PCMP*.
+  if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT &&
+      Cond.getOpcode() == ISD::SETCC) {
+
+    assert(Cond.getValueType().isVector() &&
+           "vector select expects a vector selector!");
+
+    EVT IntVT = Cond.getValueType();
+    bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
+    bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+    if (!TValIsAllOnes && !FValIsAllZeros) {
+      // Try invert the condition if true value is not all 1s and false value
+      // is not all 0s.
+      bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+      bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
+
+      if (TValIsAllZeros || FValIsAllOnes) {
+        SDValue CC = Cond.getOperand(2);
+        ISD::CondCode NewCC =
+          ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+                               Cond.getOperand(0).getValueType().isInteger());
+        Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC);
+        std::swap(LHS, RHS);
+        TValIsAllOnes = FValIsAllOnes;
+        FValIsAllZeros = TValIsAllZeros;
+      }
+    }
+
+    if (TValIsAllOnes || FValIsAllZeros) {
+      SDValue Ret;
+
+      if (TValIsAllOnes && FValIsAllZeros)
+        Ret = Cond;
+      else if (TValIsAllOnes)
+        Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, IntVT, RHS));
+      else if (FValIsAllZeros)
+        Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond,
+                          DAG.getNode(ISD::BITCAST, DL, IntVT, LHS));
+
+      return DAG.getNode(ISD::BITCAST, DL, VT, Ret);
+    }
+  }
+
   // If we know that this node is legal then we know that it is going to be
   // matched by one of the SSE/AVX BLEND instructions. These instructions only
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
diff --git a/test/CodeGen/X86/select-with-and-or.ll b/test/CodeGen/X86/select-with-and-or.ll
new file mode 100644
index 0000000000..1ccf30bf20
--- /dev/null
+++ b/test/CodeGen/X86/select-with-and-or.ll
@@ -0,0 +1,72 @@
+; RUN: opt < %s -O3 | \
+; RUN:	llc -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> zeroinitializer
+  ret <4 x i32> %r
+; CHECK: test1
+; CHECK: cmpnle
+; CHECK-NEXT: andps
+; CHECK: ret
+}
+
+define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c
+  ret <4 x i32> %r
+; CHECK: test2
+; CHECK: cmpnle
+; CHECK-NEXT: orps
+; CHECK: ret
+}
+
+define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> %c
+  ret <4 x i32> %r
+; CHECK: test3
+; CHECK: cmple
+; CHECK-NEXT: andps
+; CHECK: ret
+}
+
+define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+; CHECK: test4
+; CHECK: cmple
+; CHECK-NEXT: orps
+; CHECK: ret
+}
+
+define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer
+  ret <4 x i32> %r
+; CHECK: test5
+; CHECK: cmpnle
+; CHECK-NEXT: ret
+}
+
+define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) {
+  %f = fcmp ult <4 x float> %a, %b
+  %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
+  ret <4 x i32> %r
+; CHECK: test6
+; CHECK: cmple
+; CHECK-NEXT: ret
+}
+
+define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) {
+  %f = fcmp ult <4 x float> %a, %b
+  %s = sext <4 x i1> %f to <4 x i32>
+  %l = load <4 x i32>* %p
+  %r = and <4 x i32> %l, %s
+  ret <4 x i32> %r
+; CHECK: test7
+; CHECK: cmpnle
+; CHECK-NEXT: andps
+; CHECK: ret
+}
author	Michael Liao <michael.liao@intel.com>	2013-04-11 05:15:54 +0000
committer	Michael Liao <michael.liao@intel.com>	2013-04-11 05:15:54 +0000
commit	bf53841cfe3c341ebc0fca102d641c2018855254 (patch)
tree	0288e1038ed3afd3eadc5ecf44b98d8dec9e63a3
parent	02d2e612521954b5ff7c1ba6fd53e36bc51e1c48 (diff)
download	llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.gz llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.bz2 llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.xz