author     Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>  2014-06-25 10:02:21 +0000
committer  Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>  2014-06-25 10:02:21 +0000
commit     3e5582cc15639f7c908326a7c15b941fb3253a33 (patch)
tree       9c04cbc46280ed543efd6ff268a931ad2e0f5bf7
parent     97c856a6126f0853dd894cc5f78dfaa574c78540 (diff)
[X86] Add target combine rule to select ADDSUB instructions from a build_vector
This patch teaches the backend how to combine a build_vector that implements
an 'addsub' between packed float vectors into a sequence of vector add and
vector sub followed by a VSELECT.

The new VSELECT is expected to be lowered into a BLENDI. At ISel stage, the
sequence 'vector add + vector sub + BLENDI' is pattern-matched against the
ISel patterns added at r211427 to select 'addsub' instructions.

Added three more ISel patterns for ADDSUB.

Added test sse3-avx-addsub-2.ll to verify that we correctly emit 'addsub'
instructions.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211679 91177308-0d34-0410-b5e6-96231b3b80d8
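For illustration, a minimal sketch of the rewrite on a v4f32 build_vector
(DAG-style notation; the vectors A and B are placeholders, and the node
shapes mirror the tests added below):

    (build_vector (fsub (extractelt A, 0), (extractelt B, 0)),
                  (fadd (extractelt A, 1), (extractelt B, 1)),
                  (fsub (extractelt A, 2), (extractelt B, 2)),
                  (fadd (extractelt A, 3), (extractelt B, 3)))
      -->
    (vselect <-1,0,-1,0>, (fsub A, B), (fadd A, B))

The VSELECT mask selects the fsub result in the even lanes and the fadd
result in the odd lanes; lowering turns the VSELECT into a BLENDI, which
ISel then matches, together with the fsub and fadd, to a single ADDSUBPS.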
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp     129
-rw-r--r--  lib/Target/X86/X86InstrSSE.td           10
-rw-r--r--  test/CodeGen/X86/sse3-avx-addsub-2.ll  318
3 files changed, 456 insertions(+), 1 deletion(-)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 874257f212..714fc0ccbd 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -6222,6 +6222,127 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}
+/// \brief Try to fold a build_vector that performs an 'addsub' into the
+/// sequence of 'vadd + vsub + blendi'.
+static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ SDLoc DL(BV);
+ EVT VT = BV->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InVec0 = DAG.getUNDEF(VT);
+ SDValue InVec1 = DAG.getUNDEF(VT);
+
+ assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
+ VT == MVT::v2f64) && "build_vector with an invalid type found!");
+
+ // Don't try to emit a VSELECT that cannot be lowered into a blend.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ // Odd-numbered elements in the input build vector are obtained from
+ // adding two float elements.
+ // Even-numbered elements in the input build vector are obtained from
+ // subtracting two float elements.
+ unsigned ExpectedOpcode = ISD::FSUB;
+ unsigned NextExpectedOpcode = ISD::FADD;
+ bool AddFound = false;
+ bool SubFound = false;
+
+ for (unsigned i = 0, e = NumElts; i != e; i++) {
+ SDValue Op = BV->getOperand(i);
+
+ // Skip 'undef' values.
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ continue;
+ }
+
+ // Early exit if we found an unexpected opcode.
+ if (Opcode != ExpectedOpcode)
+ return SDValue();
+
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Try to match the following pattern:
+ // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
+ // Early exit if we cannot match that sequence.
+ if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op0.getOperand(1)) ||
+ !isa<ConstantSDNode>(Op1.getOperand(1)) ||
+ Op0.getOperand(1) != Op1.getOperand(1))
+ return SDValue();
+
+ unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ if (I0 != i)
+ return SDValue();
+
+ // We found a valid add/sub node. Update the information accordingly.
+ if (i & 1)
+ AddFound = true;
+ else
+ SubFound = true;
+
+ // Update InVec0 and InVec1.
+ if (InVec0.getOpcode() == ISD::UNDEF)
+ InVec0 = Op0.getOperand(0);
+ if (InVec1.getOpcode() == ISD::UNDEF)
+ InVec1 = Op1.getOperand(0);
+
+ // Make sure that the operands of each add/sub node always
+ // come from the same pair of vectors.
+ if (InVec0 != Op0.getOperand(0)) {
+ if (ExpectedOpcode == ISD::FSUB)
+ return SDValue();
+
+ // FADD is commutable. Try to commute the operands
+ // and then test again.
+ std::swap(Op0, Op1);
+ if (InVec0 != Op0.getOperand(0))
+ return SDValue();
+ }
+
+ if (InVec1 != Op1.getOperand(0))
+ return SDValue();
+
+ // Update the pair of expected opcodes.
+ std::swap(ExpectedOpcode, NextExpectedOpcode);
+ }
+
+ // Don't try to fold this build_vector into a VSELECT if it has
+ // too many UNDEF operands; we need at least one add, one sub, and
+ // both input vectors identified.
+ if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
+ InVec1.getOpcode() != ISD::UNDEF) {
+ // Emit a sequence of vector add and sub followed by a VSELECT.
+ // The new VSELECT will be lowered into a BLENDI.
+ // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
+ // and emit a single ADDSUB instruction.
+ SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
+ SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);
+
+ // Construct the VSELECT mask.
+ EVT MaskVT = VT.changeVectorElementTypeToInteger();
+ EVT SVT = MaskVT.getVectorElementType();
+ unsigned SVTBits = SVT.getSizeInBits();
+ SmallVector<SDValue, 8> Ops;
+
+ for (unsigned i = 0, e = NumElts; i != e; ++i) {
+ APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
+ APInt::getAllOnesValue(SVTBits);
+ SDValue Constant = DAG.getConstant(Value, SVT);
+ Ops.push_back(Constant);
+ }
+
+ SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
+ return DAG.getSelect(DL, VT, Mask, Sub, Add);
+ }
+
+ return SDValue();
+}
+
static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
const X86Subtarget *Subtarget) {
SDLoc DL(N);
@@ -6230,6 +6351,14 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
SDValue InVec0, InVec1;
+ // Try to match an ADDSUB.
+ if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
+ (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
+ SDValue Value = matchAddSub(BV, DAG, Subtarget);
+ if (Value.getNode())
+ return Value;
+ }
+
// Try to match horizontal ADD/SUB.
unsigned NumUndefsLO = 0;
unsigned NumUndefsHI = 0;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index a33d2cc832..988a059897 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -5377,10 +5377,15 @@ let Predicates = [HasAVX] in {
def : Pat<(v4f64 (X86Shufp (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
(v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i8 10))),
(VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
+ def : Pat<(v4f64 (X86Blendi (v4f64 (fsub VR256:$lhs, VR256:$rhs)),
+ (v4f64 (fadd VR256:$lhs, VR256:$rhs)), (i32 10))),
+ (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
def : Pat<(v4f32 (X86Blendi (v4f32 (fsub VR128:$lhs, VR128:$rhs)),
(v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
(VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
-
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
(v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
(VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
@@ -5396,6 +5401,9 @@ let Predicates = [UseSSE3] in {
(v4f32 (fadd VR128:$lhs, VR128:$rhs)), (i32 10))),
(ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (fsub VR128:$lhs, VR128:$rhs)),
+ (v2f64 (fadd VR128:$lhs, VR128:$rhs)), (i32 2))),
+ (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
def : Pat<(v2f64 (X86Movsd (v2f64 (fadd VR128:$lhs, VR128:$rhs)),
(v2f64 (fsub VR128:$lhs, VR128:$rhs)))),
(ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
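For reference, a short worked note on the blend immediates used in the
patterns above (assuming the standard BLENDPS/BLENDPD encoding, where a set
bit i selects lane i from the second operand, here the fadd result):

    i32 10 = 0b1010 -> lanes 1,3 take the fadd result; lanes 0,2 the fsub result (v4f32, v4f64)
    i32 2  = 0b0010 -> lane 1 takes the fadd result; lane 0 the fsub result (v2f64)

This is exactly the ADDSUB lane layout: subtract in even lanes, add in odd
lanes.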
diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll
new file mode 100644
index 0000000000..b7706cc34b
--- /dev/null
+++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -0,0 +1,318 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+
+
+; Verify that we correctly generate 'addsub' instructions from
+; a sequence of vector extracts + float add/sub + vector inserts.
+
+define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test1
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test2
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %4, %3
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test3
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test4
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub2 = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 1
+ %4 = extractelement <4 x float> %B, i32 1
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test5
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test6
+; SSE: addsubps
+; AVX: vaddsubps
+; CHECK-NEXT: ret
+
+
+define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
+ %1 = extractelement <4 x double> %A, i32 0
+ %2 = extractelement <4 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <4 x double> %A, i32 2
+ %4 = extractelement <4 x double> %B, i32 2
+ %sub2 = fsub double %3, %4
+ %5 = extractelement <4 x double> %A, i32 1
+ %6 = extractelement <4 x double> %B, i32 1
+ %add = fadd double %5, %6
+ %7 = extractelement <4 x double> %A, i32 3
+ %8 = extractelement <4 x double> %B, i32 3
+ %add2 = fadd double %7, %8
+ %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
+ %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
+ %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
+ %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
+ ret <4 x double> %vecinsert4
+}
+; CHECK-LABEL: test7
+; SSE: addsubpd
+; SSE-NEXT: addsubpd
+; AVX: vaddsubpd
+; AVX-NOT: vaddsubpd
+; CHECK: ret
+
+
+define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
+ %1 = extractelement <2 x double> %A, i32 0
+ %2 = extractelement <2 x double> %B, i32 0
+ %sub = fsub double %1, %2
+ %3 = extractelement <2 x double> %A, i32 1
+ %4 = extractelement <2 x double> %B, i32 1
+ %add = fadd double %3, %4
+ %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
+ %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
+ ret <2 x double> %vecinsert2
+}
+; CHECK-LABEL: test8
+; SSE: addsubpd
+; AVX: vaddsubpd
+; CHECK: ret
+
+
+define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
+ %1 = extractelement <8 x float> %A, i32 0
+ %2 = extractelement <8 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <8 x float> %A, i32 2
+ %4 = extractelement <8 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <8 x float> %A, i32 1
+ %6 = extractelement <8 x float> %B, i32 1
+ %add = fadd float %5, %6
+ %7 = extractelement <8 x float> %A, i32 3
+ %8 = extractelement <8 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %9 = extractelement <8 x float> %A, i32 4
+ %10 = extractelement <8 x float> %B, i32 4
+ %sub3 = fsub float %9, %10
+ %11 = extractelement <8 x float> %A, i32 6
+ %12 = extractelement <8 x float> %B, i32 6
+ %sub4 = fsub float %11, %12
+ %13 = extractelement <8 x float> %A, i32 5
+ %14 = extractelement <8 x float> %B, i32 5
+ %add3 = fadd float %13, %14
+ %15 = extractelement <8 x float> %A, i32 7
+ %16 = extractelement <8 x float> %B, i32 7
+ %add4 = fadd float %15, %16
+ %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
+ %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
+ %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
+ %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
+ %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
+ ret <8 x float> %vecinsert8
+}
+; CHECK-LABEL: test9
+; SSE: addsubps
+; SSE-NEXT: addsubps
+; AVX: vaddsubps
+; AVX-NOT: vaddsubps
+; CHECK: ret
+
+
+; Verify that we don't generate an 'addsub' instruction for the
+; following functions.
+define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 2
+ %2 = extractelement <4 x float> %B, i32 2
+ %sub = fsub float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 1
+ %2 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 3
+ %2 = extractelement <4 x float> %B, i32 3
+ %add = fadd float %1, %2
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
+ ret <4 x float> %vecinsert1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, %2
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 1
+ %2 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %1, %2
+ %3 = extractelement <4 x float> %A, i32 3
+ %4 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %3, %4
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ ret <4 x float> %vecinsert2
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+
+define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
+ %1 = extractelement <4 x float> %A, i32 0
+ %2 = extractelement <4 x float> %B, i32 0
+ %sub = fsub float %1, undef
+ %3 = extractelement <4 x float> %A, i32 2
+ %4 = extractelement <4 x float> %B, i32 2
+ %sub2 = fsub float %3, %4
+ %5 = extractelement <4 x float> %A, i32 1
+ %6 = extractelement <4 x float> %B, i32 1
+ %add = fadd float %5, undef
+ %7 = extractelement <4 x float> %A, i32 3
+ %8 = extractelement <4 x float> %B, i32 3
+ %add2 = fadd float %7, %8
+ %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
+ %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
+ %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
+ %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
+ ret <4 x float> %vecinsert4
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: addsubps
+; CHECK: ret
+
+