diff options
author | Michael Liao <michael.liao@intel.com> | 2013-04-11 05:15:54 +0000 |
---|---|---|
committer | Michael Liao <michael.liao@intel.com> | 2013-04-11 05:15:54 +0000 |
commit | bf53841cfe3c341ebc0fca102d641c2018855254 (patch) | |
tree | 0288e1038ed3afd3eadc5ecf44b98d8dec9e63a3 | |
parent | 02d2e612521954b5ff7c1ba6fd53e36bc51e1c48 (diff) | |
download | llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.gz llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.bz2 llvm-bf53841cfe3c341ebc0fca102d641c2018855254.tar.xz |
Optimize vector select from all 0s or all 1s
As packed comparisons in AVX/SSE produce all 0s or all 1s in each SIMD lane,
vector select could be simplified to AND/OR or removed if one or both values
being selected is all 0s or all 1s.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179267 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 45 | ||||
-rw-r--r-- | test/CodeGen/X86/select-with-and-or.ll | 72 |
2 files changed, 117 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 806e72f227..3564ce3920 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -15787,6 +15787,51 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget)) return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS); + // Simplify vector selection if the selector will be produced by CMPP*/PCMP*. + if (!DCI.isBeforeLegalize() && N->getOpcode() == ISD::VSELECT && + Cond.getOpcode() == ISD::SETCC) { + + assert(Cond.getValueType().isVector() && + "vector select expects a vector selector!"); + + EVT IntVT = Cond.getValueType(); + bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + + if (!TValIsAllOnes && !FValIsAllZeros) { + // Try invert the condition if true value is not all 1s and false value + // is not all 0s. + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); + + if (TValIsAllZeros || FValIsAllOnes) { + SDValue CC = Cond.getOperand(2); + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), + Cond.getOperand(0).getValueType().isInteger()); + Cond = DAG.getSetCC(DL, IntVT, Cond.getOperand(0), Cond.getOperand(1), NewCC); + std::swap(LHS, RHS); + TValIsAllOnes = FValIsAllOnes; + FValIsAllZeros = TValIsAllZeros; + } + } + + if (TValIsAllOnes || FValIsAllZeros) { + SDValue Ret; + + if (TValIsAllOnes && FValIsAllZeros) + Ret = Cond; + else if (TValIsAllOnes) + Ret = DAG.getNode(ISD::OR, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, RHS)); + else if (FValIsAllZeros) + Ret = DAG.getNode(ISD::AND, DL, IntVT, Cond, + DAG.getNode(ISD::BITCAST, DL, IntVT, LHS)); + + return DAG.getNode(ISD::BITCAST, DL, VT, Ret); + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. Try to use SimplifyDemandedBits diff --git a/test/CodeGen/X86/select-with-and-or.ll b/test/CodeGen/X86/select-with-and-or.ll new file mode 100644 index 0000000000..1ccf30bf20 --- /dev/null +++ b/test/CodeGen/X86/select-with-and-or.ll @@ -0,0 +1,72 @@ +; RUN: opt < %s -O3 | \ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s + +define <4 x i32> @test1(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> zeroinitializer + ret <4 x i32> %r +; CHECK: test1 +; CHECK: cmpnle +; CHECK-NEXT: andps +; CHECK: ret +} + +define <4 x i32> @test2(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %c + ret <4 x i32> %r +; CHECK: test2 +; CHECK: cmpnle +; CHECK-NEXT: orps +; CHECK: ret +} + +define <4 x i32> @test3(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> %c + ret <4 x i32> %r +; CHECK: test3 +; CHECK: cmple +; CHECK-NEXT: andps +; CHECK: ret +} + +define <4 x i32> @test4(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> %c, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %r +; CHECK: test4 +; CHECK: cmple +; CHECK-NEXT: orps +; CHECK: ret +} + +define <4 x i32> @test5(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> zeroinitializer + ret <4 x i32> %r +; CHECK: test5 +; CHECK: cmpnle +; CHECK-NEXT: ret +} + +define <4 x i32> @test6(<4 x float> %a, <4 x float> %b, <4 x i32> %c) { + %f = fcmp ult <4 x float> %a, %b + %r = select <4 x i1> %f, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1> + ret <4 x i32> %r +; CHECK: test6 +; CHECK: cmple +; CHECK-NEXT: ret +} + +define <4 x i32> @test7(<4 x float> %a, <4 x float> %b, <4 x i32>* %p) { + %f = fcmp ult <4 x float> %a, %b + %s = sext <4 x i1> %f to <4 x i32> + %l = load <4 x i32>* %p + %r = and <4 x i32> %l, %s + ret <4 x i32> %r +; CHECK: test7 +; CHECK: cmpnle +; CHECK-NEXT: andps +; CHECK: ret +} |