summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArnold Schwaighofer <aschwaighofer@apple.com>2013-09-17 18:06:50 +0000
committerArnold Schwaighofer <aschwaighofer@apple.com>2013-09-17 18:06:50 +0000
commit65457b679ae240c1a37da82c5484dac478c47b6d (patch)
tree53bf1d482a72a8c891f5e786a67b8ab6213a2a95
parent3c940067424204ecffb48ddc269895d48442279a (diff)
downloadllvm-65457b679ae240c1a37da82c5484dac478c47b6d.tar.gz
llvm-65457b679ae240c1a37da82c5484dac478c47b6d.tar.bz2
llvm-65457b679ae240c1a37da82c5484dac478c47b6d.tar.xz
Costmodel: Add support for horizontal vector reductions
Upcoming SLP vectorization improvements will want to be able to estimate costs of horizontal reductions. Add infrastructure to support this. We model reductions as a series of (shufflevector,add) tuples ultimately followed by an extractelement. For example, for an add-reduction of <4 x float> we could generate the following sequence: (v0, v1, v2, v3) \ \ / / \ \ / + + (v0+v2, v1+v3, undef, undef) \ / ((v0+v2) + (v1+v3), undef, undef) %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef> %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7 %r = extractelement <4 x float> %bin.rdx8, i32 0 This commit adds a cost model interface "getReductionCost(Opcode, Ty, Pairwise)" that will allow clients to ask for the cost of such a reduction (as backends might generate more efficient code than the cost of the individual instructions summed up). This interface is excercised by the CostModel analysis pass which looks for reduction patterns like the one above - starting at extractelements - and if it sees a matching sequence will call the cost model interface. We will also support a second form of pairwise reduction that is well supported on common architectures (haddps, vpadd, faddp). (v0, v1, v2, v3) \ / \ / (v0+v1, v2+v3, undef, undef) \ / ((v0+v1)+(v2+v3), undef, undef, undef) %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef> %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 %r = extractelement <4 x float> %bin.rdx.1, i32 0 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@190876 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Analysis/TargetTransformInfo.h16
-rw-r--r--lib/Analysis/CostModel.cpp272
-rw-r--r--lib/Analysis/TargetTransformInfo.cpp9
-rw-r--r--lib/CodeGen/BasicTargetTransformInfo.cpp15
-rw-r--r--test/Analysis/CostModel/X86/reduction.ll94
5 files changed, 406 insertions, 0 deletions
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index eb70e9cdaa..9104fd4b4a 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -368,6 +368,22 @@ public:
unsigned Alignment,
unsigned AddressSpace) const;
+ /// \brief Calculate the cost of performing a vector reduction.
+ ///
+ /// This is the cost of reducing the vector value of type \p Ty to a scalar
+ /// value using the operation denoted by \p Opcode. The form of the reduction
+ /// can either be a pairwise reduction or a reduction that splits the vector
+ /// at every reduction level.
+ ///
+ /// Pairwise:
+ /// (v0, v1, v2, v3)
+ /// ((v0+v1), (v2, v3), undef, undef)
+ /// Split:
+ /// (v0, v1, v2, v3)
+ /// ((v0+v2), (v1+v3), undef, undef)
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm) const;
+
/// \returns The cost of Intrinsic instructions.
virtual unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
ArrayRef<Type *> Tys) const;
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 927508e0a7..6970b872c1 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -19,6 +19,7 @@
#define CM_NAME "cost-model"
#define DEBUG_TYPE CM_NAME
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
@@ -26,10 +27,15 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
+ cl::Hidden,
+ cl::desc("Recognize reduction patterns."));
+
namespace {
class CostModelAnalysis : public FunctionPass {
@@ -105,6 +111,261 @@ static TargetTransformInfo::OperandValueKind getOperandInfo(Value *V) {
return OpInfo;
}
+static bool matchMask(SmallVectorImpl<int> &M1, SmallVectorImpl<int> &M2) {
+ if (M1.size() != M2.size())
+ return false;
+
+ for (unsigned i = 0, e = M1.size(); i != e; ++i)
+ if (M1[i] != M2[i])
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
+ unsigned Level) {
+ // We don't need a shuffle if we just want to have element 0 in position 0 of
+ // the vector.
+ if (!SI && Level == 0 && IsLeft)
+ return true;
+ else if (!SI)
+ return false;
+
+ SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+
+ // Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
+ // we look at the left or right side.
+ for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
+ Mask[i] = val;
+
+ SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+ if (!matchMask(Mask, ActualMask))
+ return false;
+
+ return true;
+}
+
+static bool matchPairwiseReductionAtLevel(const BinaryOperator *BinOp,
+ unsigned Level, unsigned NumLevels) {
+ // Match one level of pairwise operations.
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ if (BinOp == 0)
+ return false;
+
+ Type *VecTy = BinOp->getType();
+ assert(VecTy->isVectorTy() && "Expecting a vector type");
+
+ unsigned Opcode = BinOp->getOpcode();
+ Value *L = BinOp->getOperand(0);
+ Value *R = BinOp->getOperand(1);
+
+ ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(L);
+ if (!LS && Level)
+ return false;
+ ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(R);
+ if (!RS && Level)
+ return false;
+
+ // On level 0 we can omit one shufflevector instruction.
+ if (!Level && !RS && !LS)
+ return false;
+
+ // Shuffle inputs must match.
+ Value *NextLevelOpL = LS ? LS->getOperand(0) : 0;
+ Value *NextLevelOpR = RS ? RS->getOperand(0) : 0;
+ Value *NextLevelOp = 0;
+ if (NextLevelOpR && NextLevelOpL) {
+ // If we have two shuffles their operands must match.
+ if (NextLevelOpL != NextLevelOpR)
+ return false;
+
+ NextLevelOp = NextLevelOpL;
+ } else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
+ // On the first level we can omit the shufflevector <0, undef,...>. So the
+ // input to the other shufflevector <1, undef> must match with one of the
+ // inputs to the current binary operation.
+ // Example:
+ // %NextLevelOpL = shufflevector %R, <1, undef ...>
+ // %BinOp = fadd %NextLevelOpL, %R
+ if (NextLevelOpL && NextLevelOpL != R)
+ return false;
+ else if (NextLevelOpR && NextLevelOpR != L)
+ return false;
+
+ NextLevelOp = NextLevelOpL ? R : L;
+ } else
+ return false;
+
+ // Check that the next levels binary operation exists and matches with the
+ // current one.
+ BinaryOperator *NextLevelBinOp = 0;
+ if (Level + 1 != NumLevels) {
+ if (!(NextLevelBinOp = dyn_cast<BinaryOperator>(NextLevelOp)))
+ return false;
+ else if (NextLevelBinOp->getOpcode() != Opcode)
+ return false;
+ }
+
+ // Shuffle mask for pairwise operation must match.
+ if (matchPairwiseShuffleMask(LS, true, Level)) {
+ if (!matchPairwiseShuffleMask(RS, false, Level))
+ return false;
+ } else if (matchPairwiseShuffleMask(RS, true, Level)) {
+ if (!matchPairwiseShuffleMask(LS, false, Level))
+ return false;
+ } else
+ return false;
+
+ if (++Level == NumLevels)
+ return true;
+
+ // Match next level.
+ return matchPairwiseReductionAtLevel(NextLevelBinOp, Level, NumLevels);
+}
+
+static bool matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffle,shuffle,add triples like the following
+ // that builds a pairwise reduction tree.
+ //
+ // (X0, X1, X2, X3)
+ // (X0 + X1, X2 + X3, undef, undef)
+ // ((X0 + X1) + (X2 + X3), undef, undef, undef)
+ //
+ // %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ // %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ // %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ // %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ // %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+ if (!matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)))
+ return false;
+
+ Opcode = RdxStart->getOpcode();
+ Ty = VecTy;
+
+ return true;
+}
+
+static std::pair<Value *, ShuffleVectorInst *>
+getShuffleAndOtherOprd(BinaryOperator *B) {
+
+ Value *L = B->getOperand(0);
+ Value *R = B->getOperand(1);
+ ShuffleVectorInst *S = 0;
+
+ if ((S = dyn_cast<ShuffleVectorInst>(L)))
+ return std::make_pair(R, S);
+
+ S = dyn_cast<ShuffleVectorInst>(R);
+ return std::make_pair(L, S);
+}
+
+static bool matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
+ unsigned &Opcode, Type *&Ty) {
+ if (!EnableReduxCost)
+ return false;
+
+ // Need to extract the first element.
+ ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
+ unsigned Idx = ~0u;
+ if (CI)
+ Idx = CI->getZExtValue();
+ if (Idx != 0)
+ return false;
+
+ BinaryOperator *RdxStart = dyn_cast<BinaryOperator>(ReduxRoot->getOperand(0));
+ if (!RdxStart)
+ return false;
+ unsigned RdxOpcode = RdxStart->getOpcode();
+
+ Type *VecTy = ReduxRoot->getOperand(0)->getType();
+ unsigned NumVecElems = VecTy->getVectorNumElements();
+ if (!isPowerOf2_32(NumVecElems))
+ return false;
+
+ // We look for a sequence of shuffles and adds like the following matching one
+ // fadd, shuffle vector pair at a time.
+ //
+ // %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef,
+ // <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ // %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+ // %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef,
+ // <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+ // %r = extractelement <4 x float> %bin.rdx8, i32 0
+
+ unsigned MaskStart = 1;
+ Value *RdxOp = RdxStart;
+ SmallVector<int, 32> ShuffleMask(NumVecElems, 0);
+ unsigned NumVecElemsRemain = NumVecElems;
+ while (NumVecElemsRemain - 1) {
+ // Check for the right reduction operation.
+ BinaryOperator *BinOp;
+ if (!(BinOp = dyn_cast<BinaryOperator>(RdxOp)))
+ return false;
+ if (BinOp->getOpcode() != RdxOpcode)
+ return false;
+
+ Value *NextRdxOp;
+ ShuffleVectorInst *Shuffle;
+ tie(NextRdxOp, Shuffle) = getShuffleAndOtherOprd(BinOp);
+
+ // Check the current reduction operation and the shuffle use the same value.
+ if (Shuffle == 0)
+ return false;
+ if (Shuffle->getOperand(0) != NextRdxOp)
+ return false;
+
+ // Check that shuffle masks matches.
+ for (unsigned j = 0; j != MaskStart; ++j)
+ ShuffleMask[j] = MaskStart + j;
+ // Fill the rest of the mask with -1 for undef.
+ std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
+
+ SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+ if (!matchMask(ShuffleMask, Mask))
+ return false;
+
+ RdxOp = NextRdxOp;
+ NumVecElemsRemain /= 2;
+ MaskStart *= 2;
+ }
+
+ Opcode = RdxOpcode;
+ Ty = VecTy;
+ return true;
+}
+
unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
if (!TTI)
return -1;
@@ -189,6 +450,17 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
unsigned Idx = -1;
if (CI)
Idx = CI->getZExtValue();
+
+ // Try to match a reduction sequence (series of shufflevector and vector
+ // adds followed by a extractelement).
+ unsigned ReduxOpCode;
+ Type *ReduxType;
+
+ if (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, false);
+ else if (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType))
+ return TTI->getReductionCost(ReduxOpCode, ReduxType, true);
+
return TTI->getVectorInstrCost(I->getOpcode(),
EEI->getOperand(0)->getType(), Idx);
}
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index b23223d56f..0353295345 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -224,6 +224,11 @@ unsigned TargetTransformInfo::getAddressComputationCost(Type *Tp,
return PrevTTI->getAddressComputationCost(Tp, IsComplex);
}
+unsigned TargetTransformInfo::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ return PrevTTI->getReductionCost(Opcode, Ty, IsPairwise);
+}
+
namespace {
struct NoTTI : ImmutablePass, TargetTransformInfo {
@@ -592,6 +597,10 @@ struct NoTTI : ImmutablePass, TargetTransformInfo {
unsigned getAddressComputationCost(Type *Tp, bool) const {
return 0;
}
+
+ unsigned getReductionCost(unsigned, Type *, bool) const {
+ return 1;
+ }
};
} // end anonymous namespace
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index 9c4b49aa7c..24aa1abffa 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -113,6 +113,7 @@ public:
ArrayRef<Type*> Tys) const;
virtual unsigned getNumberOfParts(Type *Tp) const;
virtual unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const;
+ virtual unsigned getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwise) const;
/// @}
};
@@ -510,3 +511,17 @@ unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
return 0;
}
+
+unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwise) const {
+ assert(Ty->isVectorTy() && "Expect a vector type");
+ unsigned NumVecElts = Ty->getVectorNumElements();
+ unsigned NumReduxLevels = Log2_32(NumVecElts);
+ unsigned ArithCost = NumReduxLevels *
+ TopTTI->getArithmeticInstrCost(Opcode, Ty);
+ // Assume the pairwise shuffles add a cost.
+ unsigned ShuffleCost =
+ NumReduxLevels * (IsPairwise + 1) *
+ TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
+ return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
+}
diff --git a/test/Analysis/CostModel/X86/reduction.ll b/test/Analysis/CostModel/X86/reduction.ll
new file mode 100644
index 0000000000..37d5f24abd
--- /dev/null
+++ b/test/Analysis/CostModel/X86/reduction.ll
@@ -0,0 +1,94 @@
+; RUN: opt < %s -cost-model -costmodel-reduxcost=true -analyze -mcpu=core2 -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define fastcc float @reduction_cost_float(<4 x float> %rdx) {
+ %rdx.shuf = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ %bin.rdx = fadd <4 x float> %rdx, %rdx.shuf
+ %rdx.shuf7 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx8 = fadd <4 x float> %bin.rdx, %rdx.shuf7
+
+; Check that we recognize the tree starting at the extractelement as a
+; reduction.
+; CHECK-LABEL: reduction_cost
+; CHECK: cost of 9 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx8, i32 0
+ ret float %r
+}
+
+define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) {
+ %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef,
+ <8 x i32> <i32 4 , i32 5, i32 6, i32 7,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx = add <8 x i32> %rdx, %rdx.shuf
+ %rdx.shuf.2 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef,
+ <8 x i32> <i32 2 , i32 3, i32 undef, i32 undef,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2
+ %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef,
+ <8 x i32> <i32 1 , i32 undef, i32 undef, i32 undef,
+ i32 undef, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3
+
+; CHECK-LABEL: reduction_cost_int
+; CHECK: cost of 23 {{.*}} extractelement
+
+ %r = extractelement <8 x i32> %bin.rdx.3, i32 0
+ ret i32 %r
+}
+
+define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}
+define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.1, %rdx.shuf.0.0
+ %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_assoc
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}
+
+define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) {
+ %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
+ %rdx.shuf.0.1 = shufflevector <4 x float> %rdx, <4 x float> undef,
+ <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
+ %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
+ %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef,
+ <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1
+
+; CHECK-LABEL: pairwise_hadd_skip_first
+; CHECK: cost of 11 {{.*}} extractelement
+
+ %r = extractelement <4 x float> %bin.rdx.1, i32 0
+ %r2 = fadd float %r, %f1
+ ret float %r2
+}