summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorStephen Lin <stephenwlin@gmail.com>2013-07-12 15:31:36 +0000
committerStephen Lin <stephenwlin@gmail.com>2013-07-12 15:31:36 +0000
commitfff967358b56c4e191089f668b75ae415b5bd992 (patch)
tree063cb5d8eab7374555fa488f0229a8f123353c9c
parent55ec2218c448ef9e0d09b5534885b6d2a9786a73 (diff)
downloadllvm-fff967358b56c4e191089f668b75ae415b5bd992.tar.gz
llvm-fff967358b56c4e191089f668b75ae415b5bd992.tar.bz2
llvm-fff967358b56c4e191089f668b75ae415b5bd992.tar.xz
X86: fold SSE2/AVX2 logical shift by immediate amount into zero vector when possible
Patch by Andrea Di Biagio git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@186165 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp38
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll247
-rw-r--r--test/CodeGen/X86/sse2-vector-shifts.ll247
3 files changed, 532 insertions, 0 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6284dd7e58..95ca6c315d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -16321,6 +16321,38 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+/// \brief Returns an all-zeros vector if the input node is a vector logical
+/// shift by a splat constant amount that is greater than or equal to the
+/// vector element size in bits; otherwise returns an empty SDValue.
+static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
+ (!Subtarget->hasInt256() ||
+ (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
+ return SDValue();
+
+ SDValue Amt = N->getOperand(1);
+ SDLoc DL(N);
+ if (isSplatVector(Amt.getNode())) {
+ SDValue SclrAmt = Amt->getOperand(0);
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(SclrAmt)) {
+ APInt ShiftAmt = C->getAPIntValue();
+ unsigned MaxAmount = VT.getVectorElementType().getSizeInBits();
+
+ // SSE2/AVX2 logical shifts always return a vector of 0s
+ // if the shift amount is greater than or equal to
+ // the element size. The constant shift amount will be
+ // encoded as an 8-bit immediate, hence the trunc(8) below.
+ if (ShiftAmt.trunc(8).uge(MaxAmount))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+ }
+ }
+
+ return SDValue();
+}
+
/// PerformShiftCombine - Combine shifts.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -16330,6 +16362,12 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
if (V.getNode()) return V;
}
+ if (N->getOpcode() != ISD::SRA) {
+ // Try to fold this logical shift into a zero vector.
+ SDValue V = performShiftToAllZeros(N, DAG, Subtarget);
+ if (V.getNode()) return V;
+ }
+
return SDValue();
}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
new file mode 100644
index 0000000000..ca18a60b3c
--- /dev/null
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -0,0 +1,247 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s
+
+; AVX2 Logical Shift Left
+
+define <16 x i16> @test_sllw_1(<16 x i16> %InVec) {
+entry:
+ %shl = shl <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sllw_1:
+; CHECK: vpsllw $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
+entry:
+ %shl = shl <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sllw_2:
+; CHECK: vpaddw %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
+entry:
+ %shl = shl <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sllw_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
+entry:
+ %shl = shl <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_slld_1:
+; CHECK: vpslld $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
+entry:
+ %shl = shl <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_slld_2:
+; CHECK: vpaddd %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
+entry:
+ %shl = shl <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_slld_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
+entry:
+ %shl = shl <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_sllq_1:
+; CHECK: vpsllq $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
+entry:
+ %shl = shl <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_sllq_2:
+; CHECK: vpaddq %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
+entry:
+ %shl = shl <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_sllq_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+; AVX2 Arithmetic Shift Right
+
+define <16 x i16> @test_sraw_1(<16 x i16> %InVec) {
+entry:
+ %shl = ashr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sraw_1:
+; CHECK: vpsraw $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
+entry:
+ %shl = ashr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sraw_2:
+; CHECK: vpsraw $1, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
+entry:
+ %shl = ashr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_sraw_3:
+; CHECK: vpsraw $16, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srad_1(<8 x i32> %InVec) {
+entry:
+ %shl = ashr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srad_1:
+; CHECK: vpsrad $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
+entry:
+ %shl = ashr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srad_2:
+; CHECK: vpsrad $1, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
+entry:
+ %shl = ashr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srad_3:
+; CHECK: vpsrad $32, %ymm0, %ymm0
+; CHECK: ret
+
+; AVX2 Logical Shift Right
+
+define <16 x i16> @test_srlw_1(<16 x i16> %InVec) {
+entry:
+ %shl = lshr <16 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_srlw_1:
+; CHECK: vpsrlw $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
+entry:
+ %shl = lshr <16 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_srlw_2:
+; CHECK: vpsrlw $1, %ymm0, %ymm0
+; CHECK: ret
+
+define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
+entry:
+ %shl = lshr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <16 x i16> %shl
+}
+
+; CHECK: test_srlw_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
+entry:
+ %shl = lshr <8 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srld_1:
+; CHECK: vpsrld $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
+entry:
+ %shl = lshr <8 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srld_2:
+; CHECK: vpsrld $1, %ymm0, %ymm0
+; CHECK: ret
+
+define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
+entry:
+ %shl = lshr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ ret <8 x i32> %shl
+}
+
+; CHECK: test_srld_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
+entry:
+ %shl = lshr <4 x i64> %InVec, <i64 0, i64 0, i64 0, i64 0>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_srlq_1:
+; CHECK: vpsrlq $0, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
+entry:
+ %shl = lshr <4 x i64> %InVec, <i64 1, i64 1, i64 1, i64 1>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_srlq_2:
+; CHECK: vpsrlq $1, %ymm0, %ymm0
+; CHECK: ret
+
+define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
+entry:
+ %shl = lshr <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ ret <4 x i64> %shl
+}
+
+; CHECK: test_srlq_3:
+; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: ret
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
new file mode 100644
index 0000000000..312ca9533c
--- /dev/null
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -0,0 +1,247 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2 -mcpu=corei7 | FileCheck %s
+
+; SSE2 Logical Shift Left
+
+define <8 x i16> @test_sllw_1(<8 x i16> %InVec) {
+entry:
+ %shl = shl <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sllw_1:
+; CHECK: psllw $0, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_sllw_2(<8 x i16> %InVec) {
+entry:
+ %shl = shl <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sllw_2:
+; CHECK: paddw %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_sllw_3(<8 x i16> %InVec) {
+entry:
+ %shl = shl <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sllw_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_slld_1(<4 x i32> %InVec) {
+entry:
+ %shl = shl <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_slld_1:
+; CHECK: pslld $0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_slld_2(<4 x i32> %InVec) {
+entry:
+ %shl = shl <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_slld_2:
+; CHECK: paddd %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_slld_3(<4 x i32> %InVec) {
+entry:
+ %shl = shl <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_slld_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_sllq_1(<2 x i64> %InVec) {
+entry:
+ %shl = shl <2 x i64> %InVec, <i64 0, i64 0>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_sllq_1:
+; CHECK: psllq $0, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_sllq_2(<2 x i64> %InVec) {
+entry:
+ %shl = shl <2 x i64> %InVec, <i64 1, i64 1>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_sllq_2:
+; CHECK: paddq %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_sllq_3(<2 x i64> %InVec) {
+entry:
+ %shl = shl <2 x i64> %InVec, <i64 64, i64 64>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_sllq_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+; SSE2 Arithmetic Shift Right
+
+define <8 x i16> @test_sraw_1(<8 x i16> %InVec) {
+entry:
+ %shl = ashr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sraw_1:
+; CHECK: psraw $0, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_sraw_2(<8 x i16> %InVec) {
+entry:
+ %shl = ashr <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sraw_2:
+; CHECK: psraw $1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_sraw_3(<8 x i16> %InVec) {
+entry:
+ %shl = ashr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_sraw_3:
+; CHECK: psraw $16, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srad_1(<4 x i32> %InVec) {
+entry:
+ %shl = ashr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srad_1:
+; CHECK: psrad $0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srad_2(<4 x i32> %InVec) {
+entry:
+ %shl = ashr <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srad_2:
+; CHECK: psrad $1, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srad_3(<4 x i32> %InVec) {
+entry:
+ %shl = ashr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srad_3:
+; CHECK: psrad $32, %xmm0
+; CHECK-NEXT: ret
+
+; SSE2 Logical Shift Right
+
+define <8 x i16> @test_srlw_1(<8 x i16> %InVec) {
+entry:
+ %shl = lshr <8 x i16> %InVec, <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_srlw_1:
+; CHECK: psrlw $0, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_srlw_2(<8 x i16> %InVec) {
+entry:
+ %shl = lshr <8 x i16> %InVec, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_srlw_2:
+; CHECK: psrlw $1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_srlw_3(<8 x i16> %InVec) {
+entry:
+ %shl = lshr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ ret <8 x i16> %shl
+}
+
+; CHECK: test_srlw_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srld_1(<4 x i32> %InVec) {
+entry:
+ %shl = lshr <4 x i32> %InVec, <i32 0, i32 0, i32 0, i32 0>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srld_1:
+; CHECK: psrld $0, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srld_2(<4 x i32> %InVec) {
+entry:
+ %shl = lshr <4 x i32> %InVec, <i32 1, i32 1, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srld_2:
+; CHECK: psrld $1, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_srld_3(<4 x i32> %InVec) {
+entry:
+ %shl = lshr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ ret <4 x i32> %shl
+}
+
+; CHECK: test_srld_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_srlq_1(<2 x i64> %InVec) {
+entry:
+ %shl = lshr <2 x i64> %InVec, <i64 0, i64 0>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_srlq_1:
+; CHECK: psrlq $0, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_srlq_2(<2 x i64> %InVec) {
+entry:
+ %shl = lshr <2 x i64> %InVec, <i64 1, i64 1>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_srlq_2:
+; CHECK: psrlq $1, %xmm0
+; CHECK-NEXT: ret
+
+define <2 x i64> @test_srlq_3(<2 x i64> %InVec) {
+entry:
+ %shl = lshr <2 x i64> %InVec, <i64 64, i64 64>
+ ret <2 x i64> %shl
+}
+
+; CHECK: test_srlq_3:
+; CHECK: xorps %xmm0, %xmm0
+; CHECK-NEXT: ret