Increased the register pressure limit on x86_64 from 8 to 12

regs. This is the only change in this checkin that may affects the default scheduler. With better register tracking and heuristics, it doesn't make sense to artificially lower the register limit so much. Added -sched-high-latency-cycles and X86InstrInfo::isHighLatencyDef to give the scheduler a way to account for div and sqrt on targets that don't have an itinerary. It is currently defaults to 10 (the actual number doesn't matter much), but only takes effect on non-default schedulers: list-hybrid and list-ilp. Added several heuristics that can be individually disabled for the non-default sched=list-ilp mode. This helps us determine how much better we can do on a given benchmark than the default scheduler. Certain compute intensive loops run much faster in this mode with the right set of heuristics, and it doesn't seem to have much negative impact elsewhere. Not all of the heuristics are needed, but we still need to experiment to decide which should be disabled by default for sched=list-ilp. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127067 91177308-0d34-0410-b5e6-96231b3b80d8
author: Andrew Trick <atrick@apple.com> 2011-03-05 08:00:22 +0000
committer: Andrew Trick <atrick@apple.com> 2011-03-05 08:00:22 +0000
commit: e0ef509aeb47b396cf1bdc170ca4f468f799719f (patch)
tree: be309e6cc6e8a6f933206cba78de2930eba7a668
parent: a4a1c3f0d92f8737b5ff72bfd4b44bf19c859cee (diff)
download: llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.gz
llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.bz2
llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.xz
6 files changed, 173 insertions, 30 deletions
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index fc7b51ec6c..13bcee652f 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -641,6 +641,10 @@ public:
   virtual int getInstrLatency(const InstrItineraryData *ItinData,
                               SDNode *Node) const;
 
+  /// isHighLatencyDef - Return true if this opcode has high latency to its
+  /// result.
+  bool isHighLatencyDef(int opc) const { return false; }
+
   /// hasHighOperandLatency - Compute operand latency between a def of 'Reg'
   /// and an use in the current loop, return true if the target considered
   /// it 'high'. This is used by optimization passes such as machine LICM to
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index d253b3148b..035d027afe 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -70,6 +70,43 @@ static cl::opt<bool> DisableSchedCycles(
   "disable-sched-cycles", cl::Hidden, cl::init(false),
   cl::desc("Disable cycle-level precision during preRA scheduling"));
 
+// Temporary sched=list-ilp flags until the heuristics are robust.
+static cl::opt<bool> DisableSchedRegPressure(
+  "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
+  cl::desc("Disable regpressure priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedLiveUses(
+  "disable-sched-live-uses", cl::Hidden, cl::init(false),
+  cl::desc("Disable live use priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedStalls(
+  "disable-sched-stalls", cl::Hidden, cl::init(false),
+  cl::desc("Disable no-stall priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedCriticalPath(
+  "disable-sched-critical-path", cl::Hidden, cl::init(false),
+  cl::desc("Disable critical path priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedHeight(
+  "disable-sched-height", cl::Hidden, cl::init(false),
+  cl::desc("Disable scheduled-height priority in sched=list-ilp"));
+
+static cl::opt<int> MaxReorderWindow(
+  "max-sched-reorder", cl::Hidden, cl::init(6),
+  cl::desc("Number of instructions to allow ahead of the critical path "
+           "in sched=list-ilp"));
+
+static cl::opt<unsigned> AvgIPC(
+  "sched-avg-ipc", cl::Hidden, cl::init(1),
+  cl::desc("Average inst/cycle whan no target itinerary exists."));
+
+#ifndef NDEBUG
+namespace {
+  // For sched=list-ilp, Count the number of times each factor comes into play.
+  enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactUllman,
+         NumFactors };
+}
+static const char *FactorName[NumFactors] =
+{"PressureDiff", "RegUses", "Height", "Depth","Ullman"};
+static int FactorCount[NumFactors];
+#endif //!NDEBUG
+
 namespace {
 //===----------------------------------------------------------------------===//
 /// ScheduleDAGRRList - The actual register reduction list scheduler
@@ -103,6 +140,10 @@ private:
   /// MinAvailableCycle - Cycle of the soonest available instruction.
   unsigned MinAvailableCycle;
 
+  /// IssueCount - Count instructions issued in this cycle
+  /// Currently valid only for bottom-up scheduling.
+  unsigned IssueCount;
+
   /// LiveRegDefs - A set of physical registers and their definition
   /// that are "live". These nodes must be scheduled before any other nodes that
   /// modifies the registers can be scheduled.
@@ -234,8 +275,14 @@ void ScheduleDAGRRList::Schedule() {
   DEBUG(dbgs()
         << "********** List Scheduling BB#" << BB->getNumber()
         << " '" << BB->getName() << "' **********\n");
+#ifndef NDEBUG
+  for (int i = 0; i < NumFactors; ++i) {
+    FactorCount[i] = 0;
+  }
+#endif //!NDEBUG
 
   CurCycle = 0;
+  IssueCount = 0;
   MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX;
   NumLiveRegs = 0;
   LiveRegDefs.resize(TRI->getNumRegs(), NULL);
@@ -258,6 +305,11 @@ void ScheduleDAGRRList::Schedule() {
   else
     ListScheduleTopDown();
 
+#ifndef NDEBUG
+  for (int i = 0; i < NumFactors; ++i) {
+    DEBUG(dbgs() << FactorName[i] << "\t" << FactorCount[i] << "\n");
+  }
+#endif // !NDEBUG
   AvailableQueue->releaseState();
 }
 
@@ -383,6 +435,7 @@ void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
   if (NextCycle <= CurCycle)
     return;
 
+  IssueCount = 0;
   AvailableQueue->setCurCycle(NextCycle);
   if (!HazardRec->isEnabled()) {
     // Bypass lots of virtual calls in case of long latency.
@@ -502,10 +555,10 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
 
   AvailableQueue->ScheduledNode(SU);
 
-  // If HazardRec is disabled, count each inst as one cycle.
-  // Advance CurCycle before ReleasePredecessors to avoid useles pushed to
+  // If HazardRec is disabled, and each inst counts as one cycle, then
+  // advance CurCycle before ReleasePredecessors to avoid useles pushed to
   // PendingQueue for schedulers that implement HasReadyFilter.
-  if (!HazardRec->isEnabled())
+  if (!HazardRec->isEnabled() && AvgIPC < 2)
     AdvanceToCycle(CurCycle + 1);
 
   // Update liveness of predecessors before successors to avoid treating a
@@ -533,7 +586,9 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
   // If HazardRec is disabled, the cycle was advanced earlier.
   //
   // Check AvailableQueue after ReleasePredecessors in case of zero latency.
+  ++IssueCount;
   if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+      || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
       || AvailableQueue->empty())
     AdvanceToCycle(CurCycle + 1);
 }
@@ -1458,7 +1513,9 @@ public:
 
   bool HighRegPressure(const SUnit *SU) const;
 
-  bool MayReduceRegPressure(SUnit *SU);
+  bool MayReduceRegPressure(SUnit *SU) const;
+
+  int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
 
   void ScheduledNode(SUnit *SU);
 
@@ -1678,7 +1735,7 @@ bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
   return false;
 }
 
-bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) {
+bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
   const SDNode *N = SU->getNode();
 
   if (!N->isMachineOpcode() || !SU->NumSuccs)
@@ -1696,6 +1753,53 @@ bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) {
   return false;
 }
 
+// Compute the register pressure contribution by this instruction by count up
+// for uses that are not live and down for defs. Only count register classes
+// that are already under high pressure. As a side effect, compute the number of
+// uses of registers that are already live.
+//
+// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
+// so could probably be factored.
+int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
+  LiveUses = 0;
+  int PDiff = 0;
+  for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
+       I != E; ++I) {
+    if (I->isCtrl())
+      continue;
+    SUnit *PredSU = I->getSUnit();
+    // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+    // to cover the number of registers defined (they are all live).
+    if (PredSU->NumRegDefsLeft == 0) {
+      if (PredSU->getNode()->isMachineOpcode())
+        ++LiveUses;
+      continue;
+    }
+    for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+         RegDefPos.IsValid(); RegDefPos.Advance()) {
+      EVT VT = RegDefPos.GetValue();
+      unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+      if (RegPressure[RCId] >= RegLimit[RCId])
+        ++PDiff;
+    }
+  }
+  const SDNode *N = SU->getNode();
+
+  if (!N->isMachineOpcode() || !SU->NumSuccs)
+    return PDiff;
+
+  unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+  for (unsigned i = 0; i != NumDefs; ++i) {
+    EVT VT = N->getValueType(i);
+    if (!N->hasAnyUseOfValue(i))
+      continue;
+    unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+    if (RegPressure[RCId] >= RegLimit[RCId])
+      --PDiff;
+  }
+  return PDiff;
+}
+
 void RegReductionPQBase::ScheduledNode(SUnit *SU) {
   if (!TracksRegPressure)
     return;
@@ -1998,9 +2102,10 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
 static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
   unsigned LPriority = SPQ->getNodePriority(left);
   unsigned RPriority = SPQ->getNodePriority(right);
-  if (LPriority != RPriority)
+  if (LPriority != RPriority) {
+    DEBUG(++FactorCount[FactUllman]);
     return LPriority > RPriority;
-
+  }
   // Try schedule def + use closer when Sethi-Ullman numbers are the same.
   // e.g.
   // t1 = op t2, c1
@@ -2128,21 +2233,37 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
     // No way to compute latency of calls.
     return BURRSort(left, right, SPQ);
 
-  bool LHigh = SPQ->HighRegPressure(left);
-  bool RHigh = SPQ->HighRegPressure(right);
-  // Avoid causing spills. If register pressure is high, schedule for
-  // register pressure reduction.
-  if (LHigh && !RHigh)
-    return true;
-  else if (!LHigh && RHigh)
-    return false;
-  else if (!LHigh && !RHigh) {
-    // Low register pressure situation, schedule to maximize instruction level
-    // parallelism.
-    if (left->NumPreds > right->NumPreds)
-      return false;
-    else if (left->NumPreds < right->NumPreds)
-      return true;
+  unsigned LLiveUses, RLiveUses;
+  int LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
+  int RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
+  if (!DisableSchedRegPressure && LPDiff != RPDiff) {
+    DEBUG(++FactorCount[FactPressureDiff]);
+    return LPDiff > RPDiff;
+  }
+
+  if (!DisableSchedLiveUses && LLiveUses != RLiveUses) {
+    DEBUG(dbgs() << "Live uses " << left->NodeNum << " = " << LLiveUses
+          << " != " << right->NodeNum << " = " << RLiveUses << "\n");
+    DEBUG(++FactorCount[FactRegUses]);
+    return LLiveUses < RLiveUses;
+  }
+
+  bool LStall = BUHasStall(left, left->getHeight(), SPQ);
+  bool RStall = BUHasStall(right, right->getHeight(), SPQ);
+  if (!DisableSchedStalls && LStall != RStall) {
+    DEBUG(++FactorCount[FactHeight]);
+    return left->getHeight() > right->getHeight();
+  }
+
+  if (!DisableSchedCriticalPath
+      && abs((long)left->getDepth() - right->getDepth()) > MaxReorderWindow) {
+    DEBUG(++FactorCount[FactDepth]);
+    return left->getDepth() < right->getDepth();
+  }
+
+  if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+    DEBUG(++FactorCount[FactHeight]);
+    return left->getHeight() > right->getHeight();
   }
 
   return BURRSort(left, right, SPQ);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 477c1ffe65..25f0927a3c 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -27,12 +27,21 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 STATISTIC(LoadsClustered, "Number of loads clustered together");
 
+// This allows latency based scheduler to notice high latency instructions
+// without a target itinerary. The choise if number here has more to do with
+// balancing scheduler heursitics than with the actual machine latency.
+static cl::opt<int> HighLatencyCycles(
+  "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+  cl::desc("Roughly estimate the number of cycles that 'long latency'"
+           "instructions take for targets with no itinerary"));
+
 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
   : ScheduleDAG(mf),
     InstrItins(mf.getTarget().getInstrItineraryData()) {}
@@ -506,7 +515,10 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) {
   }
 
   if (!InstrItins || InstrItins->isEmpty()) {
-    SU->Latency = 1;
+    if (SU->getNode() && TII->isHighLatencyDef(SU->getNode()->getOpcode()))
+      SU->Latency = HighLatencyCycles;
+    else
+      SU->Latency = 1;
     return;
   }
 
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 722202dc2d..c21aa43ee3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1284,7 +1284,7 @@ X86TargetLowering::getRegPressureLimit(const TargetRegisterClass *RC,
   case X86::GR32RegClassID:
     return 4 - FPDiff;
   case X86::GR64RegClassID:
-    return 8 - FPDiff;
+    return 12 - FPDiff;
   case X86::VR128RegClassID:
     return Subtarget->is64Bit() ? 10 : 4;
   case X86::VR64RegClassID:
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 76a9b12b8a..21df57c9cd 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3085,12 +3085,8 @@ void X86InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
   NopInst.setOpcode(X86::NOOP);
 }
 
-bool X86InstrInfo::
-hasHighOperandLatency(const InstrItineraryData *ItinData,
-                      const MachineRegisterInfo *MRI,
-                      const MachineInstr *DefMI, unsigned DefIdx,
-                      const MachineInstr *UseMI, unsigned UseIdx) const {
-  switch (DefMI->getOpcode()) {
+bool X86InstrInfo::isHighLatencyDef(int opc) const {
+  switch (opc) {
   default: return false;
   case X86::DIVSDrm:
   case X86::DIVSDrm_Int:
@@ -3120,6 +3116,14 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
   }
 }
 
+bool X86InstrInfo::
+hasHighOperandLatency(const InstrItineraryData *ItinData,
+                      const MachineRegisterInfo *MRI,
+                      const MachineInstr *DefMI, unsigned DefIdx,
+                      const MachineInstr *UseMI, unsigned UseIdx) const {
+  return isHighLatencyDef(DefMI->getOpcode());
+}
+
 namespace {
   /// CGBR - Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index f4581dae4d..81bbd3dd03 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -858,6 +858,8 @@ public:
                                       const SmallVectorImpl<MachineOperand> &MOs,
                                       unsigned Size, unsigned Alignment) const;
 
+  bool isHighLatencyDef(int opc) const;
+
   bool hasHighOperandLatency(const InstrItineraryData *ItinData,
                              const MachineRegisterInfo *MRI,
                              const MachineInstr *DefMI, unsigned DefIdx,
author	Andrew Trick <atrick@apple.com>	2011-03-05 08:00:22 +0000
committer	Andrew Trick <atrick@apple.com>	2011-03-05 08:00:22 +0000
commit	e0ef509aeb47b396cf1bdc170ca4f468f799719f (patch)
tree	be309e6cc6e8a6f933206cba78de2930eba7a668
parent	a4a1c3f0d92f8737b5ff72bfd4b44bf19c859cee (diff)
download	llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.gz llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.bz2 llvm-e0ef509aeb47b396cf1bdc170ca4f468f799719f.tar.xz