summaryrefslogtreecommitdiff
path: root/lib/Target/X86/X86InstrInfo.cpp
diff options
context:
space:
mode:
authorAndrew Trick <atrick@apple.com>2013-10-14 22:19:03 +0000
committerAndrew Trick <atrick@apple.com>2013-10-14 22:19:03 +0000
commita6a9ac5aa1092067e6e1546226d8bdd6a4bfcf99 (patch)
treed28e7ac2e1333f9dc7af9c1f71719d94cba35483 /lib/Target/X86/X86InstrInfo.cpp
parent966772931eea7cdc3cdd7199e304d667aa344bd7 (diff)
downloadllvm-a6a9ac5aa1092067e6e1546226d8bdd6a4bfcf99.tar.gz
llvm-a6a9ac5aa1092067e6e1546226d8bdd6a4bfcf99.tar.bz2
llvm-a6a9ac5aa1092067e6e1546226d8bdd6a4bfcf99.tar.xz
Fix the ExecutionDepsFix pass to handle AVX instructions.
This pass is needed to break false dependencies. Without it, unlucky register assignment can result in wild (5x) swings in performance. This pass was trying to handle AVX but not getting it right. AVX doesn't have partial register defs, it has unused register reads in which the high bits of a source operand are copied into the unused bits of the dest. Fixing this requires conservative liveness analysis. This is awkward because the pass already has its own pseudo-liveness. However, proper liveness is expensive, and we would like to use a generic utility to compute it. The fix only invokes liveness on-demand. It is rare to detect a case that needs undef-read dependence breaking, but when it happens, it can be needed many times within a very large block. I think the existing heuristic which uses a register window of 16 is too conservative for loop-carried false dependencies. If the loop is a reduction, the out-of-order engine may be able to execute several loop iterations in parallel. However, I'll leave this tuning exercise for next time. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192635 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86/X86InstrInfo.cpp')
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp81
1 files changed, 67 insertions, 14 deletions
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index dfc8cadedc..32d2e16fed 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -4073,20 +4073,6 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
case X86::RSQRTSSr_Int:
case X86::SQRTSSr:
case X86::SQRTSSr_Int:
- // AVX encoded versions
- case X86::VCVTSD2SSrr:
- case X86::Int_VCVTSD2SSrr:
- case X86::VCVTSS2SDrr:
- case X86::Int_VCVTSS2SDrr:
- case X86::VCVTSD2SSZrr:
- case X86::VCVTSS2SDZrr:
- case X86::VRCPSSr:
- case X86::VROUNDSDr:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSr_Int:
- case X86::VRSQRTSSr:
- case X86::VSQRTSSr:
return true;
}
@@ -4118,10 +4104,77 @@ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
return 16;
}
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::Int_VCVTSI2SSrr:
+ case X86::VCVTSI2SS64rr:
+ case X86::Int_VCVTSI2SS64rr:
+ case X86::VCVTSI2SDrr:
+ case X86::Int_VCVTSI2SDrr:
+ case X86::VCVTSI2SD64rr:
+ case X86::Int_VCVTSI2SD64rr:
+ case X86::VCVTSD2SSrr:
+ case X86::Int_VCVTSD2SSrr:
+ case X86::VCVTSS2SDrr:
+ case X86::Int_VCVTSS2SDrr:
+ case X86::VRCPSSr:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSr_Int:
+ case X86::VRSQRTSSr:
+ case X86::VSQRTSSr:
+
+ // AVX-512 encoded versions
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSS2SDZrr:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExeDepsFix pass how many idle instructions we would like before
+/// certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned X86InstrInfo::
+getUndefRegClearance(const MachineInstr *MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // Report the first source operand (operand 1) back to the caller via OpNum.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ // Use the same magic number as getPartialRegUpdateClearance.
+ return 16;
+ }
+ return 0;
+}
+
void X86InstrInfo::
breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
const TargetRegisterInfo *TRI) const {
unsigned Reg = MI->getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI->killsRegister(Reg, TRI))
+ return;
if (X86::VR128RegClass.contains(Reg)) {
// These instructions are all floating point domain, so xorps is the best
// choice.