summaryrefslogtreecommitdiff
path: root/lib/Target/X86/X86TargetTransformInfo.cpp
diff options
context:
space:
mode:
authorArnold Schwaighofer <aschwaighofer@apple.com>2013-03-02 04:02:52 +0000
committerArnold Schwaighofer <aschwaighofer@apple.com>2013-03-02 04:02:52 +0000
commit5f0d9dbdf48a9efe16bfadf88e5335f7b9a8ec3f (patch)
treee39c3262177b95d29a2415f009cabade61fdd135 /lib/Target/X86/X86TargetTransformInfo.cpp
parent1c01af8f26f1ddca69d332dd8456fdeab3d1936e (diff)
downloadllvm-5f0d9dbdf48a9efe16bfadf88e5335f7b9a8ec3f.tar.gz
llvm-5f0d9dbdf48a9efe16bfadf88e5335f7b9a8ec3f.tar.bz2
llvm-5f0d9dbdf48a9efe16bfadf88e5335f7b9a8ec3f.tar.xz
X86 cost model: Adjust cost for custom lowered vector multiplies
This matters for example in following matrix multiply: int **mmult(int rows, int cols, int **m1, int **m2, int **m3) { int i, j, k, val; for (i=0; i<rows; i++) { for (j=0; j<cols; j++) { val = 0; for (k=0; k<cols; k++) { val += m1[i][k] * m2[k][j]; } m3[i][j] = val; } } return(m3); } Taken from the test-suite benchmark Shootout. We estimate the cost of the multiply to be 2 while we generate 9 instructions for it and end up being quite a bit slower than the scalar version (48% on my machine). Also, properly differentiate between avx1 and avx2. On avx-1 we still split the vector into 2 128bits and handle the subvector muls like above with 9 instructions. Only on avx-2 will we have a cost of 9 for v4i64. I changed the test case in test/Transforms/LoopVectorize/X86/avx1.ll to use an add instead of a mul because with a mul we now no longer vectorize. I did verify that the mul would be indeed more expensive when vectorized with 3 kernels: for (i ...) r += a[i] * 3; for (i ...) m1[i] = m1[i] * 3; // This matches the test case in avx1.ll and a matrix multiply. In each case the vectorized version was considerably slower. radar://13304919 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176403 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86/X86TargetTransformInfo.cpp')
-rw-r--r--lib/Target/X86/X86TargetTransformInfo.cpp34
1 files changed, 29 insertions, 5 deletions
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index fefb479da9..be2a997b8e 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,18 +176,42 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
{ ISD::MUL, MVT::v8i32, 4 },
{ ISD::SUB, MVT::v8i32, 4 },
{ ISD::ADD, MVT::v8i32, 4 },
- { ISD::MUL, MVT::v4i64, 4 },
{ ISD::SUB, MVT::v4i64, 4 },
{ ISD::ADD, MVT::v4i64, 4 },
- };
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+ // are lowered as a series of long multiplies(3), shifts(4) and adds(2)
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // split factor of two in the cost table. Therefore, the cost here is 18
+ // instead of 9.
+ { ISD::MUL, MVT::v4i64, 18 },
+ };
// Look for AVX1 lowering tricks.
- if (ST->hasAVX()) {
- int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
- LT.second);
+ if (ST->hasAVX() && !ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+ ISD, LT.second);
if (Idx != -1)
return LT.first * AVX1CostTable[Idx].Cost;
}
+
+ // Custom lowering of vectors.
+ static const CostTblEntry<MVT> CustomLowered[] = {
+ // A v2i64/v4i64 and multiply is custom lowered as a series of long
+ // multiplies(3), shifts(4) and adds(2).
+ { ISD::MUL, MVT::v2i64, 9 },
+ { ISD::MUL, MVT::v4i64, 9 },
+ };
+ int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * CustomLowered[Idx].Cost;
+
+ // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+ // 2x pmuludq, 2x shuffle.
+ if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+ !ST->hasSSE41())
+ return 6;
+
// Fallback to the default implementation.
return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty);
}