diff options
author | Evan Cheng <evan.cheng@apple.com> | 2010-10-29 18:09:28 +0000 |
---|---|---|
committer | Evan Cheng <evan.cheng@apple.com> | 2010-10-29 18:09:28 +0000 |
commit | 089751535d6e9adf65842e2ca5867bf9a70e1e95 (patch) | |
tree | 883781d177686ef496311a6d049024b5b61219ef /lib | |
parent | d7e473c629a5e4fb1584fb5c5c1b0c1e142fdc8f (diff) | |
download | llvm-089751535d6e9adf65842e2ca5867bf9a70e1e95.tar.gz llvm-089751535d6e9adf65842e2ca5867bf9a70e1e95.tar.bz2 llvm-089751535d6e9adf65842e2ca5867bf9a70e1e95.tar.xz |
Avoiding overly aggressive latency scheduling. If the two nodes share an
operand and one of them has a single use that is a live out copy, favor the
one that is live out. Otherwise it will be difficult to eliminate the copy
if the instruction is a loop induction variable update. e.g.
BB:
sub r1, r3, #1
str r0, [r2, r3]
mov r3, r1
cmp
bne BB
=>
BB:
str r0, [r2, r3]
sub r3, r3, #1
cmp
bne BB
This fixed the recent 256.bzip2 regression.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@117675 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r-- | lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 93 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 9 | ||||
-rw-r--r-- | lib/Target/ARM/ARMBaseInstrInfo.cpp | 9 |
3 files changed, 85 insertions, 26 deletions
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 4c3e4e3b07..ea1aaa1e05 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -190,7 +190,7 @@ private: void ScheduleDAGRRList::Schedule() { DEBUG(dbgs() << "********** List Scheduling BB#" << BB->getNumber() - << " **********\n"); + << " '" << BB->getName() << "' **********\n"); NumLiveRegs = 0; LiveRegDefs.resize(TRI->getNumRegs(), NULL); @@ -1483,6 +1483,46 @@ static unsigned calcMaxScratches(const SUnit *SU) { return Scratches; } +/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a +/// CopyToReg to a virtual register. This SU def is probably a liveout and +/// it has no other use. It should be scheduled closer to the terminator. +static bool hasOnlyLiveOutUses(const SUnit *SU) { + bool RetVal = false; + for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); + I != E; ++I) { + if (I->isCtrl()) continue; + const SUnit *SuccSU = I->getSUnit(); + if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) { + unsigned Reg = + cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + RetVal = true; + continue; + } + } + return false; + } + return RetVal; +} + +/// UnitsSharePred - Return true if the two scheduling units share a common +/// data predecessor. +static bool UnitsSharePred(const SUnit *left, const SUnit *right) { + SmallSet<const SUnit*, 4> Preds; + for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) continue; // ignore chain preds + Preds.insert(I->getSUnit()); + } + for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) continue; // ignore chain preds + if (Preds.count(I->getSUnit())) + return true; + } + return false; +} + template <typename RRSort> static bool BURRSort(const SUnit *left, const SUnit *right, const RegReductionPriorityQueue<RRSort> *SPQ) { @@ -1558,29 +1598,46 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{ else if (!LHigh && RHigh) return false; else if (!LHigh && !RHigh) { + // If the two nodes share an operand and one of them has a single + // use that is a live out copy, favor the one that is live out. Otherwise + // it will be difficult to eliminate the copy if the instruction is a + // loop induction variable update. e.g. + // BB: + // sub r1, r3, #1 + // str r0, [r2, r3] + // mov r3, r1 + // cmp + // bne BB + bool SharePred = UnitsSharePred(left, right); + // FIXME: Only adjust if BB is a loop back edge. + // FIXME: What's the cost of a copy? + int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0; + int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0; + int LHeight = (int)left->getHeight() - LBonus; + int RHeight = (int)right->getHeight() - RBonus; + // Low register pressure situation, schedule for latency if possible. bool LStall = left->SchedulingPref == Sched::Latency && - SPQ->getCurCycle() < left->getHeight(); + (int)SPQ->getCurCycle() < LHeight; bool RStall = right->SchedulingPref == Sched::Latency && - SPQ->getCurCycle() < right->getHeight(); + (int)SPQ->getCurCycle() < RHeight; // If scheduling one of the node will cause a pipeline stall, delay it. // If scheduling either one of the node will cause a pipeline stall, sort // them according to their height. - // If neither will cause a pipeline stall, try to reduce register pressure. if (LStall) { if (!RStall) return true; - if (left->getHeight() != right->getHeight()) - return left->getHeight() > right->getHeight(); + if (LHeight != RHeight) + return LHeight > RHeight; } else if (RStall) return false; - // If either node is scheduling for latency, sort them by height and latency - // first. + // If either node is scheduling for latency, sort them by height + // and latency. if (left->SchedulingPref == Sched::Latency || right->SchedulingPref == Sched::Latency) { - if (left->getHeight() != right->getHeight()) - return left->getHeight() > right->getHeight(); + if (LHeight != RHeight) + return LHeight > RHeight; if (left->Latency != right->Latency) return left->Latency > right->Latency; } @@ -1631,19 +1688,6 @@ RegReductionPriorityQueue<SF>::canClobber(const SUnit *SU, const SUnit *Op) { return false; } -/// hasCopyToRegUse - Return true if SU has a value successor that is a -/// CopyToReg node. -static bool hasCopyToRegUse(const SUnit *SU) { - for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end(); - I != E; ++I) { - if (I->isCtrl()) continue; - const SUnit *SuccSU = I->getSUnit(); - if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) - return true; - } - return false; -} - /// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's /// physical register defs. static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU, @@ -1813,6 +1857,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() { if (!Node || !Node->isMachineOpcode() || SU->getNode()->getFlaggedNode()) continue; + bool isLiveOut = hasOnlyLiveOutUses(SU); unsigned Opc = Node->getMachineOpcode(); const TargetInstrDesc &TID = TII->get(Opc); unsigned NumRes = TID.getNumDefs(); @@ -1862,7 +1907,7 @@ void RegReductionPriorityQueue<SF>::AddPseudoTwoAddrDeps() { SuccOpc == TargetOpcode::SUBREG_TO_REG) continue; if ((!canClobber(SuccSU, DUSU) || - (hasCopyToRegUse(SU) && !hasCopyToRegUse(SuccSU)) || + (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) || (!SU->isCommutable && SuccSU->isCommutable)) && !scheduleDAG->IsReachable(SuccSU, SU)) { DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #" diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index d34a52d801..7d01bd31b9 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -458,6 +458,15 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use, // Adjust the use operand index by num of defs. OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs(); int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx); + if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg && + !BB->succ_empty()) { + unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) + // This copy is a liveout value. It is likely coalesced, so reduce the + // latency so not to penalize the def. + // FIXME: need target specific adjustment here? + Latency = (Latency > 1) ? Latency - 1 : 1; + } if (Latency >= 0) dep.setLatency(Latency); } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 51db677581..c23bc93925 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1967,8 +1967,13 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData, if (!ItinData || ItinData->isEmpty()) return DefTID.mayLoad() ? 3 : 1; - if (!UseNode->isMachineOpcode()) - return ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx); + if (!UseNode->isMachineOpcode()) { + int Latency = ItinData->getOperandCycle(DefTID.getSchedClass(), DefIdx); + if (Subtarget.isCortexA9()) + return Latency <= 2 ? 1 : Latency - 1; + else + return Latency <= 3 ? 1 : Latency - 2; + } const TargetInstrDesc &UseTID = get(UseNode->getMachineOpcode()); const MachineSDNode *DefMN = dyn_cast<MachineSDNode>(DefNode); |