author    Hal Finkel <hfinkel@anl.gov>    2014-04-04 23:51:18 +0000
committer Hal Finkel <hfinkel@anl.gov>    2014-04-04 23:51:18 +0000
commit    e6a5b33e6e3e9626634f08f2dab8cbc0866e30b5 (patch)
tree      48a6115dc1972a6184a8c04732dd3b94235975e8 /lib
parent    cef9f7ef271e9d95c8151ed8e683ef272b6b8c18 (diff)
[PowerPC] Adjust load/store costs in PPCTTI
This provides more realistic costs for the insert/extractelement instructions (which are load/store pairs), accounts for the cheap unaligned Altivec load sequence, and for unaligned VSX load/stores.

Bad news:
MultiSource/Applications/sgefa/sgefa - 35% slowdown (this will require more investigation)
SingleSource/Benchmarks/McGill/queens - 20% slowdown (we no longer vectorize this, but it was a constant store that was scalarized)
MultiSource/Benchmarks/FreeBench/pcompress2/pcompress2 - 2% slowdown

Good news:
SingleSource/Benchmarks/Shootout/ary3 - 54% speedup
SingleSource/Benchmarks/Shootout-C++/ary - 40% speedup
MultiSource/Benchmarks/Ptrdist/ks/ks - 35% speedup
MultiSource/Benchmarks/FreeBench/neural/neural - 30% speedup
MultiSource/Benchmarks/TSVC/Symbolics-flt/Symbolics-flt - 20% speedup

Unfortunately, estimating the costs of the stack-based scalarization sequences is hard, and adjusting these costs is like a game of whac-a-mole :( I'll revisit this again after we have better codegen for vector extloads and truncstores and unaligned load/stores.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205658 91177308-0d34-0410-b5e6-96231b3b80d8
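For reference, the "cheap unaligned Altivec load sequence" credited above is the classic lvx/lvsl/vperm pattern: two aligned loads plus a permute, rather than a scalarized element-by-element load. A minimal sketch using the standard Altivec intrinsics (the function name is illustrative; assumes a PowerPC target built with -maltivec):

#include <altivec.h>

// Illustrative sketch of the permutation-based unaligned load that makes
// unaligned 128-bit Altivec loads cheap: two aligned loads plus a permute.
__vector unsigned char LoadUnaligned(const unsigned char *Ptr) {
  __vector unsigned char Lo = vec_ld(0, Ptr);     // aligned block covering the start
  __vector unsigned char Hi = vec_ld(15, Ptr);    // aligned block covering the end
  __vector unsigned char Mask = vec_lvsl(0, Ptr); // permute mask from Ptr's low address bits
  return vec_perm(Lo, Hi, Mask);                  // merge the two into the unaligned value
}

Because this sequence costs only a couple of extra instructions, the patch below stops charging unaligned 128-bit Altivec vector loads the full decomposition penalty.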
Diffstat (limited to 'lib')
-rw-r--r--  lib/Target/PowerPC/PPCTargetTransformInfo.cpp  26
1 file changed, 23 insertions, 3 deletions
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 53b2dd65d0..ed849b5bc8 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -216,7 +216,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark. It may need to be
   // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
 
   // Vector element insert/extract with Altivec is very expensive,
   // because they require store and reload with the attendant
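The effect of this hunk, restated as plain arithmetic (a minimal sketch with invented names, not the LLVM API): the flat penalty of 12 becomes opcode-dependent, 2 for extracts and 9 for inserts.

// Sketch of the new penalty split from the hunk above.
unsigned LHSPenaltyFor(bool IsInsert) {
  unsigned Penalty = 2;  // extractelement: small store/reload penalty
  if (IsInsert)
    Penalty += 7;        // insertelement: 2 + 7 = 9, down from the flat 12
  return Penalty;
}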
@@ -240,14 +242,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   unsigned Cost =
     TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // FIXME: Update this for VSX loads/stores that support unaligned access.
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
 
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
   unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
     Cost += LT.first*(SrcBytes/Alignment-1);
 
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
+
   return Cost;
 }
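To make the new control flow easy to check, here is a self-contained model of the hunk above (invented names, not the LLVM API; it assumes the type is already legal, i.e. LT.first == 1, and takes an illustrative per-element extract cost as a parameter):

#include <cstdio>

// Simplified stand-in for the patched getMemoryOpCost logic.
unsigned MemoryOpCost(bool IsLoad, bool IsVector, unsigned SrcBytes,
                      unsigned Alignment, bool HasVSXType,
                      unsigned NumElts, unsigned ExtractCost) {
  unsigned Cost = 1; // stand-in for the target-independent base cost

  // VSX v2f64/v2i64 loads/stores handle unaligned access directly.
  if (HasVSXType)
    return Cost;

  // Unaligned 128-bit Altivec *loads* stay cheap (lvx/lvsl/vperm sequence).
  bool UnalignedAltivec = IsVector && SrcBytes == 16 && IsLoad;

  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
    Cost += SrcBytes / Alignment - 1;   // extra decomposed accesses
    if (IsVector && !IsLoad)            // unaligned vector stores scalarize:
      Cost += NumElts * ExtractCost;    // one extract per element
  }
  return Cost;
}

int main() {
  // Unaligned v4f32 (16 bytes, align 4) on an Altivec-only target, with an
  // illustrative extract cost of 3 per element:
  std::printf("v4f32 load  cost: %u\n", MemoryOpCost(true,  true, 16, 4, false, 4, 3));
  std::printf("v4f32 store cost: %u\n", MemoryOpCost(false, true, 16, 4, false, 4, 3));
  return 0;
}

In this model the unaligned load costs 1 while the store costs 16, which matches the patch's intent: permuted loads are nearly free, stack-based scalarized stores are expensive.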