summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJakob Stoklund Olesen <stoklund@2pi.dk>2011-11-15 01:15:30 +0000
committerJakob Stoklund Olesen <stoklund@2pi.dk>2011-11-15 01:15:30 +0000
commitc2ecf3efbf375fc82bb1cea6afd7448498f9ae75 (patch)
tree4cb44beba42b77f41d955701edd3e69df3ec1b68
parent2947f730a96fc602ea008bba1929ae4f0638850a (diff)
downloadllvm-c2ecf3efbf375fc82bb1cea6afd7448498f9ae75.tar.gz
llvm-c2ecf3efbf375fc82bb1cea6afd7448498f9ae75.tar.bz2
llvm-c2ecf3efbf375fc82bb1cea6afd7448498f9ae75.tar.xz
Break false dependencies before partial register updates.
Two new TargetInstrInfo hooks lets the target tell ExecutionDepsFix about instructions with partial register updates causing false unwanted dependencies. The ExecutionDepsFix pass will break the false dependencies if the updated register was written in the previoius N instructions. The small loop added to sse-domains.ll runs twice as fast with dependency-breaking instructions inserted. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@144602 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Target/TargetInstrInfo.h68
-rw-r--r--lib/CodeGen/ExecutionDepsFix.cpp27
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp52
-rw-r--r--lib/Target/X86/X86InstrInfo.h5
-rw-r--r--test/CodeGen/X86/sse-domains.ll41
5 files changed, 193 insertions, 0 deletions
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 07f614d61d..590fc1e8f7 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -718,6 +718,74 @@ public:
///
virtual void setExecutionDomain(MachineInstr *MI, unsigned Domain) const {}
+
+ /// getPartialRegUpdateClearance - Returns the preferred minimum clearance
+ /// before an instruction with an unwanted partial register update.
+ ///
+ /// Some instructions only write part of a register, and implicitly need to
+ /// read the other parts of the register. This may cause unwanted stalls
+ /// preventing otherwise unrelated instructions from executing in parallel in
+ /// an out-of-order CPU.
+ ///
+ /// For example, the x86 instruction cvtsi2ss writes its result to bits
+ /// [31:0] of the destination xmm register. Bits [127:32] are unaffected, so
+ /// the instruction needs to wait for the old value of the register to become
+ /// available:
+ ///
+ /// addps %xmm1, %xmm0
+ /// movaps %xmm0, (%rax)
+ /// cvtsi2ss %rbx, %xmm0
+ ///
+ /// In the code above, the cvtsi2ss instruction needs to wait for the addps
+ /// instruction before it can issue, even though the high bits of %xmm0
+ /// probably aren't needed.
+ ///
+ /// This hook returns the preferred clearance before MI, measured in
+ /// instructions. Other defs of MI's operand OpNum are avoided in the last N
+ /// instructions before MI. It should only return a positive value for
+ /// unwanted dependencies. If the old bits of the defined register have
+ /// useful values, or if MI is determined to otherwise read the dependency,
+ /// the hook should return 0.
+ ///
+ /// The unwanted dependency may be handled by:
+ ///
+ /// 1. Allocating the same register for an MI def and use. That makes the
+ /// unwanted dependency identical to a required dependency.
+ ///
+ /// 2. Allocating a register for the def that has no defs in the previous N
+ /// instructions.
+ ///
+ /// 3. Calling breakPartialRegDependency() with the same arguments. This
+ /// allows the target to insert a dependency breaking instruction.
+ ///
+ virtual unsigned
+ getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ // The default implementation returns 0 for no partial register dependency.
+ return 0;
+ }
+
+ /// breakPartialRegDependency - Insert a dependency-breaking instruction
+ /// before MI to eliminate an unwanted dependency on OpNum.
+ ///
+ /// If it wasn't possible to avoid a def in the last N instructions before MI
+ /// (see getPartialRegUpdateClearance), this hook will be called to break the
+ /// unwanted dependency.
+ ///
+ /// On x86, an xorps instruction can be used as a dependency breaker:
+ ///
+ /// addps %xmm1, %xmm0
+ /// movaps %xmm0, (%rax)
+ /// xorps %xmm0, %xmm0
+ /// cvtsi2ss %rbx, %xmm0
+ ///
+ /// An <imp-kill> operand should be added to MI if an instruction was
+ /// inserted. This ties the instructions together in the post-ra scheduler.
+ ///
+ virtual void
+ breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {}
+
private:
int CallFrameSetupOpcode, CallFrameDestroyOpcode;
};
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index d094411116..050edce2ec 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -471,11 +471,34 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
<< '\t' << *MI);
+ // How many instructions since rx was last written?
+ unsigned Clearance = CurInstr - LiveRegs[rx].Def;
LiveRegs[rx].Def = CurInstr;
// Kill off domains redefined by generic instructions.
if (Kill)
kill(rx);
+
+ // Verify clearance before partial register updates.
+ unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+ if (!Pref)
+ continue;
+ DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+ if (Pref > Clearance) {
+ DEBUG(dbgs() << ": Break dependency.\n");
+ TII->breakPartialRegDependency(MI, i, TRI);
+ continue;
+ }
+
+ // The current clearance seems OK, but we may be ignoring a def from a
+ // back-edge.
+ if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+ DEBUG(dbgs() << ": OK.\n");
+ continue;
+ }
+
+ // A def from an unprocessed back-edge may make us break this dependency.
+ DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
}
++CurInstr;
@@ -663,6 +686,10 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
for (unsigned i = 0, e = Loops.size(); i != e; ++i) {
MachineBasicBlock *MBB = Loops[i];
enterBasicBlock(MBB);
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I)
+ if (!I->isDebugValue())
+ processDefs(I, false);
leaveBasicBlock(MBB);
}
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index d9ffd8161f..9428fffae8 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2761,6 +2761,10 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
///
static bool hasPartialRegUpdate(unsigned Opcode) {
switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SS64rr:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SD64rr:
case X86::CVTSD2SSrr:
case X86::Int_CVTSD2SSrr:
case X86::CVTSS2SDrr:
@@ -2789,6 +2793,54 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
return false;
}
+/// getPartialRegUpdateClearance - Inform the ExeDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::
+getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI->getOpcode()))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI->getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MO.readsReg() || MI->readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI->readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any of the preceding 16 instructions are reading Reg, insert a
+ // dependency breaking instruction. The magic number is based on a few
+ // Nehalem experiments.
+ return 16;
+}
+
+void X86InstrInfo::
+breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI->getOperand(OpNum).getReg();
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ unsigned Opc = HasAVX ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef).addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ } else
+ return;
+ MI->addRegisterKilled(Reg, TRI, true);
+}
+
MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr *MI,
const SmallVectorImpl<unsigned> &Ops,
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 97009dbdbe..ee488d8f01 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -345,6 +345,11 @@ public:
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ unsigned getPartialRegUpdateClearance(const MachineInstr *MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+ void breakPartialRegDependency(MachineBasicBlock::iterator MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const;
+
MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
MachineInstr* MI,
unsigned OpNum,
diff --git a/test/CodeGen/X86/sse-domains.ll b/test/CodeGen/X86/sse-domains.ll
index d26d32287e..3b66f4fd5c 100644
--- a/test/CodeGen/X86/sse-domains.ll
+++ b/test/CodeGen/X86/sse-domains.ll
@@ -43,3 +43,44 @@ while.body:
while.end:
ret void
}
+
+; CHECK: f2
+;
+; This loop contains two cvtsi2ss instructions that update the same xmm
+; register. Verify that the execution dependency fix pass breaks those
+; dependencies by inserting xorps instructions.
+;
+; If the register allocator chooses different registers for the two cvtsi2ss
+; instructions, they are still dependent on themselves.
+; CHECK: xorps [[XMM1:%xmm[0-9]+]]
+; CHECK: , [[XMM1]]
+; CHECK: cvtsi2ss %{{.*}}, [[XMM1]]
+; CHECK: xorps [[XMM2:%xmm[0-9]+]]
+; CHECK: , [[XMM2]]
+; CHECK: cvtsi2ss %{{.*}}, [[XMM2]]
+;
+define float @f2(i32 %m) nounwind uwtable readnone ssp {
+entry:
+ %tobool3 = icmp eq i32 %m, 0
+ br i1 %tobool3, label %for.end, label %for.body
+
+for.body: ; preds = %entry, %for.body
+ %m.addr.07 = phi i32 [ %dec, %for.body ], [ %m, %entry ]
+ %s1.06 = phi float [ %add, %for.body ], [ 0.000000e+00, %entry ]
+ %s2.05 = phi float [ %add2, %for.body ], [ 0.000000e+00, %entry ]
+ %n.04 = phi i32 [ %inc, %for.body ], [ 1, %entry ]
+ %conv = sitofp i32 %n.04 to float
+ %add = fadd float %s1.06, %conv
+ %conv1 = sitofp i32 %m.addr.07 to float
+ %add2 = fadd float %s2.05, %conv1
+ %inc = add nsw i32 %n.04, 1
+ %dec = add nsw i32 %m.addr.07, -1
+ %tobool = icmp eq i32 %dec, 0
+ br i1 %tobool, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ %s1.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %s2.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add2, %for.body ]
+ %sub = fsub float %s1.0.lcssa, %s2.0.lcssa
+ ret float %sub
+}