summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorBenjamin Kramer <benny.kra@googlemail.com>2012-12-15 16:47:44 +0000
committerBenjamin Kramer <benny.kra@googlemail.com>2012-12-15 16:47:44 +0000
commit388fc6a988b56a50efff57893a4df14b4d04e1cd (patch)
tree1ef73286c3a9317acb791432b39bddb31b746c0e /lib
parentd12de955856204db4cabdd9bcabc82c22d0e85f2 (diff)
downloadllvm-388fc6a988b56a50efff57893a4df14b4d04e1cd.tar.gz
llvm-388fc6a988b56a50efff57893a4df14b4d04e1cd.tar.bz2
llvm-388fc6a988b56a50efff57893a4df14b4d04e1cd.tar.xz
X86: Add a couple of target-specific dag combines that turn VSELECTS into psubus if possible.
We match the pattern "x >= y ? x-y : 0" into "subus x, y" and two special cases if y is a constant. DAGCombiner canonicalizes those so we first have to undo the canonicalization for those cases. The pattern occurs in gzip when the loop vectorizer is enabled. Part of PR14613. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170273 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp68
-rw-r--r--lib/Target/X86/X86ISelLowering.h3
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td1
-rw-r--r--lib/Target/X86/X86InstrSSE.td34
4 files changed, 88 insertions, 18 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index d4ee985ca6..70089fcc65 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10097,6 +10097,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ // SSE2/AVX2 sub with unsigned saturation intrinsics
+ case Intrinsic::x86_sse2_psubus_b:
+ case Intrinsic::x86_sse2_psubus_w:
+ case Intrinsic::x86_avx2_psubus_b:
+ case Intrinsic::x86_avx2_psubus_w:
+ return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// SSE3/AVX horizontal add/sub intrinsics
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
@@ -11961,6 +11969,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
@@ -14913,6 +14922,65 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Match VSELECTs into subs with unsigned saturation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+ // left side invert the predicate to simplify logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
+ if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+ Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+ // If the RHS is a constant we have to reverse the const canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (CondRHS.getConstantOperandVal(0) == -A-1) {
+ SmallVector<SDValue, 32> V(VT.getVectorNumElements(),
+ DAG.getConstant(-A, VT.getScalarType()));
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getNode(ISD::BUILD_VECTOR, DL, VT,
+ V.data(), V.size()));
+ }
+ }
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (A.isSignBit())
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index cbe83a7c24..60f34099dd 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -182,6 +182,9 @@ namespace llvm {
/// BLENDI - Blend where the selector is an immediate.
BLENDI,
+ // SUBUS - Integer sub with unsigned saturation.
+ SUBUS,
+
/// HADD - Integer horizontal add.
HADD,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 09ab995166..7d16d2741d 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -128,6 +128,7 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
+def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1912a936ce..54032fe97f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -3724,6 +3724,12 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+defm VPSUBUSB : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v16i8, VR128,
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>,
+ VEX_4V;
+defm VPSUBUSW : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v8i16, VR128,
+ memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>,
+ VEX_4V;
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
@@ -3735,12 +3741,6 @@ defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w,
VR128, memopv2i64, i128mem,
SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b,
VR128, memopv2i64, i128mem,
SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
@@ -3804,6 +3804,12 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
+defm VPSUBUSBY : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v32i8, VR256,
+ memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>,
+ VEX_4V, VEX_L;
+defm VPSUBUSWY : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v16i16, VR256,
+ memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>,
+ VEX_4V, VEX_L;
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, memopv4i64, i256mem,
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
@@ -3815,12 +3821,6 @@ defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
VR256, memopv4i64, i256mem,
SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
VR256, memopv4i64, i256mem,
SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
@@ -3884,6 +3884,10 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
i128mem, SSE_INTALU_ITINS_P>;
defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
i128mem, SSE_INTALUQ_ITINS_P>;
+defm PSUBUSB : PDI_binop_rm<0xD8, "psubusb", X86subus, v16i8, VR128, memopv2i64,
+ i128mem, SSE_INTALU_ITINS_P>;
+defm PSUBUSW : PDI_binop_rm<0xD9, "psubusw", X86subus, v8i16, VR128, memopv2i64,
+ i128mem, SSE_INTALU_ITINS_P>;
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
@@ -3894,12 +3898,6 @@ defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
VR128, memopv2i64, i128mem,
SSE_INTALU_ITINS_P>;
-defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
VR128, memopv2i64, i128mem,
SSE_INTALU_ITINS_P, 1>;