-rw-r--r--  include/llvm/CodeGen/ScheduleDAGInstrs.h |  3
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp          | 89
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp         | 61
3 files changed, 132 insertions(+), 21 deletions(-)
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 4a447e2f4a..1b813141a8 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -197,6 +197,9 @@ namespace llvm {
     /// input.
     void buildSchedGraph(AliasAnalysis *AA, RegPressureTracker *RPTracker = 0);
 
+    /// Compute the cyclic critical path through the DAG.
+    unsigned computeCyclicCriticalPath();
+
     /// addSchedBarrierDeps - Add dependencies from instructions in the current
     /// list of instructions being scheduled to scheduling barrier. We want to
     /// make sure instructions which define registers that are either used by
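The new hook separates two notions of critical path. The acyclic critical path is the longest latency chain within a single iteration of the scheduled block (the depth of ExitSU); the cyclic critical path is the longest chain that feeds a value back into the next iteration through a PHI. A minimal standalone sketch of the distinction, using made-up latencies rather than LLVM's API:

    #include <algorithm>
    #include <cstdio>

    // Toy single-block loop: def A feeds B, and B feeds next iteration's A.
    // A separate long chain C does not cross the loop back-edge.
    int main() {
      unsigned LatA = 1, LatB = 3;   // recurrence A -> B -> A'
      unsigned LatChainC = 10;       // iteration-local chain
      unsigned Acyclic = std::max(LatA + LatB, LatChainC); // depth of ExitSU = 10
      unsigned Cyclic = LatA + LatB;                       // recurrence length = 4
      // With a short recurrence, overlapping iterations can hide the
      // remaining 6 cycles; that gap is what checkAcyclicLatency measures.
      std::printf("acyclic=%u cyclic=%u\n", Acyclic, Cyclic);
      return 0;
    }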
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index da6920575d..36eeb67d64 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -53,6 +53,9 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
 static bool ViewMISchedDAGs = false;
 #endif // NDEBUG
 
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+  cl::desc("Enable cyclic critical path analysis."), cl::init(false));
+
 static cl::opt<bool> EnableLoadCluster("misched-cluster", cl::Hidden,
   cl::desc("Enable load clustering."), cl::init(true));
@@ -1207,16 +1210,21 @@ public:
   struct SchedRemainder {
     // Critical path through the DAG in expected latency.
     unsigned CriticalPath;
+    unsigned CyclicCritPath;
 
     // Scaled count of micro-ops left to schedule.
     unsigned RemIssueCount;
 
+    bool IsAcyclicLatencyLimited;
+
     // Unscheduled resources
     SmallVector<unsigned, 16> RemainingCounts;
 
     void reset() {
       CriticalPath = 0;
+      CyclicCritPath = 0;
       RemIssueCount = 0;
+      IsAcyclicLatencyLimited = false;
       RemainingCounts.clear();
     }
@@ -1434,6 +1442,8 @@ public:
   virtual void registerRoots();
 
 protected:
+  void checkAcyclicLatency();
+
   void tryCandidate(SchedCandidate &Cand,
                     SchedCandidate &TryCand,
                     SchedBoundary &Zone,
@@ -1547,8 +1557,32 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
   Bot.releaseNode(SU, SU->BotReadyCycle);
 }
 
+void ConvergingScheduler::checkAcyclicLatency() {
+  if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+    return;
+
+  unsigned BufferLimit =
+    SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+  unsigned LatencyLag = Rem.CriticalPath - Rem.CyclicCritPath;
+  Rem.IsAcyclicLatencyLimited =
+    (LatencyLag * SchedModel->getLatencyFactor()) > BufferLimit;
+
+  DEBUG(dbgs() << "BufferLimit " << BufferLimit << "u / "
+        << Rem.RemIssueCount << "u = "
+        << (BufferLimit + Rem.RemIssueCount) / Rem.RemIssueCount << " iters. "
+        << "Latency = " << LatencyLag << "c = "
+        << LatencyLag * SchedModel->getLatencyFactor() << "u\n";
+        if (Rem.IsAcyclicLatencyLimited)
+          dbgs() << "  ACYCLIC LATENCY LIMIT\n");
+}
+
 void ConvergingScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+  if (EnableCyclicPath) {
+    Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+    checkAcyclicLatency();
+  }
   // Some roots may not feed into ExitSU. Check all of them in case.
   for (std::vector<SUnit*>::const_iterator
          I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
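checkAcyclicLatency flags a region as acyclic-latency limited when the latency left over after subtracting the recurrence (LatencyLag) exceeds what the out-of-order micro-op buffer can hide. A worked restatement of that comparison, with hypothetical machine-model numbers (the buffer size and factors are assumptions, not values from any real subtarget):

    #include <cstdio>

    int main() {
      // Assumed machine model: 32-entry micro-op buffer, unit factors.
      unsigned MicroOpBufferSize = 32, MicroOpFactor = 1, LatencyFactor = 1;
      // Assumed DAG numbers, matching the toy loop sketched earlier.
      unsigned CriticalPath = 40, CyclicCritPath = 4;

      unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor; // 32u
      unsigned LatencyLag = CriticalPath - CyclicCritPath;      // 36 cycles
      bool Limited = (LatencyLag * LatencyFactor) > BufferLimit; // 36u > 32u
      // The buffer cannot cover the acyclic latency, so the scheduler
      // should spend effort reducing it rather than chasing the recurrence.
      std::printf("IsAcyclicLatencyLimited=%d\n", Limited);
      return 0;
    }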
@@ -2096,6 +2130,32 @@ static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
   return 0;
 }
 
+static bool tryLatency(ConvergingScheduler::SchedCandidate &TryCand,
+                       ConvergingScheduler::SchedCandidate &Cand,
+                       ConvergingScheduler::SchedBoundary &Zone) {
+  if (Zone.isTop()) {
+    if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                  TryCand, Cand, ConvergingScheduler::TopDepthReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                   TryCand, Cand, ConvergingScheduler::TopPathReduce))
+      return true;
+  }
+  else {
+    if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+      if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+                  TryCand, Cand, ConvergingScheduler::BotHeightReduce))
+        return true;
+    }
+    if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+                   TryCand, Cand, ConvergingScheduler::BotPathReduce))
+      return true;
+  }
+  return false;
+}
+
 /// Apply a set of heuristics to a new candidate. Heuristics are currently
 /// hierarchical. This may be more efficient than a graduated cost model because
 /// we don't need to evaluate all aspects of the model for each node in the
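tryLatency packages the zone-sensitive latency comparison behind this file's tryLess/tryGreater convention: each helper compares one metric, stamps a reason on the candidate that wins, and returns true only when the comparison is decisive, so ties fall through to the next heuristic. In the top zone, reducing a candidate's depth shortens the exposed critical path; the bottom zone mirrors this with height. A simplified sketch of that convention (the signatures and enum below are abbreviated stand-ins, not the real declarations):

    #include <cstdio>

    enum CandReason { NoCand, TopDepthReduce, TopPathReduce };
    struct Candidate { CandReason Reason; };

    // Prefer the smaller value. Returns true when the comparison is
    // decisive; on a tie returns false so later heuristics can decide.
    static bool tryLess(unsigned TryVal, unsigned CandVal,
                        Candidate &TryCand, Candidate &Cand,
                        CandReason Reason) {
      if (TryVal < CandVal) { TryCand.Reason = Reason; return true; }
      if (TryVal > CandVal) { Cand.Reason = Reason; return true; }
      return false;
    }

    int main() {
      Candidate TryCand = {NoCand}, Cand = {NoCand};
      // Depth 3 beats depth 5, so TryCand wins with reason TopDepthReduce.
      bool Decided = tryLess(3, 5, TryCand, Cand, TopDepthReduce);
      std::printf("decided=%d reason=%d\n", Decided, TryCand.Reason);
      return 0;
    }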
@@ -2135,6 +2195,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
                   RegExcess))
     return;
 
+  // For loops that are acyclic path limited, aggressively schedule for latency.
+  if (Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, Zone))
+    return;
+
   // Avoid increasing the max critical pressure in the scheduled region.
   if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
                   TryCand, Cand, RegCritical))
@@ -2174,27 +2238,10 @@ void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
     return;
 
   // Avoid serializing long latency dependence chains.
-  if (Cand.Policy.ReduceLatency) {
-    if (Zone.isTop()) {
-      if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                    TryCand, Cand, TopDepthReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                     TryCand, Cand, TopPathReduce))
-        return;
-    }
-    else {
-      if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
-        if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
-                    TryCand, Cand, BotHeightReduce))
-          return;
-      }
-      if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
-                     TryCand, Cand, BotPathReduce))
-        return;
-    }
+  // For acyclic path limited loops, latency was already checked above.
+  if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
+      && tryLatency(TryCand, Cand, Zone)) {
+    return;
   }
 
   // Prefer immediate defs/users of the last scheduled instruction. This is a
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 24714089da..0b5eb0ebe8 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -36,6 +36,8 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
 using namespace llvm;
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
@@ -979,6 +981,65 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
   PendingLoads.clear();
 }
 
+/// Compute the max cyclic critical path through the DAG. For loops that span
+/// basic blocks, MachineTraceMetrics should be used for this instead.
+unsigned ScheduleDAGInstrs::computeCyclicCriticalPath() {
+  // This only applies to single-block loops.
+  if (!BB->isSuccessor(BB))
+    return 0;
+
+  unsigned MaxCyclicLatency = 0;
+  // Visit each live out vreg def to find def/use pairs that cross iterations.
+  for (SUnit::const_pred_iterator
+         PI = ExitSU.Preds.begin(), PE = ExitSU.Preds.end(); PI != PE; ++PI) {
+    MachineInstr *MI = PI->getSUnit()->getInstr();
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        break;
+      unsigned Reg = MO.getReg();
+      if (!Reg || TRI->isPhysicalRegister(Reg))
+        continue;
+
+      const LiveInterval &LI = LIS->getInterval(Reg);
+      unsigned LiveOutHeight = PI->getSUnit()->getHeight();
+      unsigned LiveOutDepth = PI->getSUnit()->getDepth() + PI->getLatency();
+      // Visit all local users of the vreg def.
+      for (VReg2UseMap::iterator
+             UI = VRegUses.find(Reg); UI != VRegUses.end(); ++UI) {
+        if (UI->SU == &ExitSU)
+          continue;
+
+        // Only consider uses of the phi.
+        LiveRangeQuery LRQ(LI, LIS->getInstructionIndex(UI->SU->getInstr()));
+        if (!LRQ.valueIn()->isPHIDef())
+          continue;
+
+        // Cheat a bit and assume that a path spanning two iterations is a
+        // cycle, which could overestimate in strange cases. This allows cyclic
+        // latency to be estimated as the minimum height or depth slack.
+        unsigned CyclicLatency = 0;
+        if (LiveOutDepth > UI->SU->getDepth())
+          CyclicLatency = LiveOutDepth - UI->SU->getDepth();
+        unsigned LiveInHeight = UI->SU->getHeight() + PI->getLatency();
+        if (LiveInHeight > LiveOutHeight) {
+          if (LiveInHeight - LiveOutHeight < CyclicLatency)
+            CyclicLatency = LiveInHeight - LiveOutHeight;
+        }
+        else
+          CyclicLatency = 0;
+        DEBUG(dbgs() << "Cyclic Path: SU(" << PI->getSUnit()->NodeNum
+              << ") -> SU(" << UI->SU->NodeNum << ") = "
+              << CyclicLatency << "\n");
+        if (CyclicLatency > MaxCyclicLatency)
+          MaxCyclicLatency = CyclicLatency;
+      }
+    }
+  }
+  DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "\n");
+  return MaxCyclicLatency;
+}
+
 void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   SU->getInstr()->dump();
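The per-edge estimate in computeCyclicCriticalPath can be checked by hand: the cyclic latency contributed by a live-out def and its PHI-fed use is the smaller of the depth slack and the height slack, and zero if either slack is not positive. A worked example with assumed SUnit depths and heights (none of these numbers come from a real DAG):

    #include <algorithm>
    #include <cstdio>

    int main() {
      // Assumed numbers: live-out def SU(5), PHI-fed use SU(1) in the next
      // iteration, connected by a latency-1 back-edge.
      unsigned DefDepth = 10, DefHeight = 2;
      unsigned UseDepth = 3, UseHeight = 8;
      unsigned EdgeLatency = 1;

      unsigned LiveOutDepth = DefDepth + EdgeLatency;  // 11
      unsigned LiveOutHeight = DefHeight;              // 2
      unsigned LiveInHeight = UseHeight + EdgeLatency; // 9

      unsigned CyclicLatency = 0;
      if (LiveOutDepth > UseDepth)
        CyclicLatency = LiveOutDepth - UseDepth;       // depth slack = 8
      if (LiveInHeight > LiveOutHeight)                // height slack = 7
        CyclicLatency = std::min(CyclicLatency, LiveInHeight - LiveOutHeight);
      else
        CyclicLatency = 0;
      std::printf("cyclic latency = %u\n", CyclicLatency); // min(8, 7) = 7
      return 0;
    }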