author    Vincent Lejeune <vljn@ovi.com>    2013-06-07 23:30:34 +0000
committer Vincent Lejeune <vljn@ovi.com>    2013-06-07 23:30:34 +0000
commit    843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4 (patch)
tree      5c1348e877b36cf4695b7d7d50c3e01edca25996 /lib/Target/R600/R600MachineScheduler.cpp
parent    b01bdf87ff5e13eb22fcc20cd395bf282fbf1ecd (diff)
R600: Use a refined heuristic to choose when to switch clauses
This is using a hint from the AMD APP OpenCL Programming Guide with empirically tweaked parameters. I used Unigine Heaven 3.0 as the benchmark to determine the best parameters on my system (i7 2600 / Radeon 6950 / kernel 3.9.4): the average went from 38.8 fps to 39.6 fps, a ~2% gain. (The Lightmark 2008.2 gain is much more marginal: from 537 to 539.) There is no lit test provided, as the parameters were determined empirically and it would be nearly impossible to find a test program that checks for optimal behavior.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183593 91177308-0d34-0410-b5e6-96231b3b80d8
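As a rough illustration (not part of the patch), the heuristic boils down to two small formulas; below is a minimal standalone C++ sketch under that reading. The 500-cycle TEX latency, 8-cycle ALU latency and 248-GPR budget come straight from the comments in the diff, while the function names (estimateWavefrontsNeeded, wavefrontsLimitedByGPR, shouldSwitchToFetch) are illustrative and not LLVM APIs.

#include <cassert>

// Approximate number of wavefronts needed so that TEX latency (~500 cycles)
// is hidden by ALU work (~8 cycles per ALU instruction), per the AMD APP
// OpenCL Programming Guide hint: 500 / (AluFetchRatio * 8) == 62.5 / ratio.
static float estimateWavefrontsNeeded(unsigned AluInsts, unsigned FetchInsts) {
  assert(FetchInsts && "need at least one pending fetch instruction");
  float AluFetchRatio = static_cast<float>(AluInsts) / FetchInsts;
  return 62.5f / AluFetchRatio;
}

// Wavefront count that still fits in the register file when each wavefront
// needs GPRCount 128-bit GPRs (248 usable GPRs, as in the patch).
static unsigned wavefrontsLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

// Switch from the ALU clause to a fetch clause when the wavefront count
// required to hide TEX latency exceeds what the estimated GPR usage of the
// pending fetch clause (~2 GPRs per fetch) allows.
static bool shouldSwitchToFetch(unsigned AluInsts, unsigned PendingFetches) {
  unsigned NearRegisterRequirement = 2 * PendingFetches;
  return estimateWavefrontsNeeded(AluInsts, PendingFetches) >
         wavefrontsLimitedByGPR(NearRegisterRequirement);
}

For example, with 10 ALU instructions against 20 pending fetches the ALU/fetch ratio is 0.5, so about 125 wavefronts would be needed to hide TEX latency while only 248 / (2 * 20) = 6 fit in the register file, and the scheduler would allow switching away from the ALU clause.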
Diffstat (limited to 'lib/Target/R600/R600MachineScheduler.cpp')
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp  52
1 file changed, 43 insertions, 9 deletions
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 29f3e1a7d5..a330d88574 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ assert (GPRCount && "GPRCount cannot be 0");
+ return 248 / GPRCount;
+}
+
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
@@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ float ALUFetchRationEstimate =
+ (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going to use too many GPRs, we flush Fetch instructions to lower
+ // register pressure on 128 bits regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+
+
// We want to schedule AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
@@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
+ } else
+ FetchInstCount++;
}
static bool
@@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
- AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+ AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+ AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+ AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must come first
if (!AvailableAlus[AluPredX].empty()) {