author    Vincent Lejeune <vljn@ovi.com>    2013-06-07 23:30:34 +0000
committer Vincent Lejeune <vljn@ovi.com>    2013-06-07 23:30:34 +0000
commit    843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4 (patch)
tree      5c1348e877b36cf4695b7d7d50c3e01edca25996 /lib/Target/R600/R600MachineScheduler.cpp
parent    b01bdf87ff5e13eb22fcc20cd395bf282fbf1ecd (diff)
R600: Use a refined heuristic to choose when to switch clauses
This is using a hint from the AMD APP OpenCL Programming Guide with empirically tweaked parameters. I used Unigine Heaven 3.0 as the benchmark to determine the best parameters on my system (i7 2600 / Radeon 6950 / kernel 3.9.4): the average went from 38.8 fps to 39.6 fps, a ~2% gain. (The Lightmark 2008.2 gain is much more marginal: from 537 to 539.) There is no lit test provided, as the parameters were determined empirically and it would be nearly impossible to find a test program that checks for optimal behavior.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183593 91177308-0d34-0410-b5e6-96231b3b80d8
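As a rough illustration (not part of the patch), the heuristic boils down to two small formulas; below is a minimal standalone C++ sketch under that reading. The 500-cycle TEX latency, 8-cycle ALU latency and 248-GPR budget come straight from the comments in the diff, while the function names (estimateWavefrontsNeeded, wavefrontsLimitedByGPR, shouldSwitchToFetch) are illustrative and not LLVM APIs.

#include <cassert>

// Approximate number of wavefronts needed so that TEX latency (~500 cycles)
// is hidden by ALU work (~8 cycles per ALU instruction), per the AMD APP
// OpenCL Programming Guide hint: 500 / (AluFetchRatio * 8) == 62.5 / ratio.
static float estimateWavefrontsNeeded(unsigned AluInsts, unsigned FetchInsts) {
  assert(FetchInsts && "need at least one pending fetch instruction");
  float AluFetchRatio = static_cast<float>(AluInsts) / FetchInsts;
  return 62.5f / AluFetchRatio;
}

// Wavefront count that still fits in the register file when each wavefront
// needs GPRCount 128-bit GPRs (248 usable GPRs, as in the patch).
static unsigned wavefrontsLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount && "GPRCount cannot be 0");
  return 248 / GPRCount;
}

// Switch from the ALU clause to a fetch clause when the wavefront count
// required to hide TEX latency exceeds what the estimated GPR usage of the
// pending fetch clause (~2 GPRs per fetch) allows.
static bool shouldSwitchToFetch(unsigned AluInsts, unsigned PendingFetches) {
  unsigned NearRegisterRequirement = 2 * PendingFetches;
  return estimateWavefrontsNeeded(AluInsts, PendingFetches) >
         wavefrontsLimitedByGPR(NearRegisterRequirement);
}

For example, with 10 ALU instructions against 20 pending fetches the ALU/fetch ratio is 0.5, so about 125 wavefronts would be needed to hide TEX latency while only 248 / (2 * 20) = 6 fit in the register file, and the scheduler would allow switching away from the ALU clause.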
Diffstat (limited to 'lib/Target/R600/R600MachineScheduler.cpp')
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp  52
1 file changed, 43 insertions, 9 deletions
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 29f3e1a7d5..a330d88574 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
-
+ AluInstCount = 0;
+ FetchInstCount = 0;
}
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
@@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
QSrc.clear();
}
+static
+unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
+ assert (GPRCount && "GPRCount cannot be 0");
+ return 248 / GPRCount;
+}
+
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
SUnit *SU = 0;
NextInstKind = IDOther;
@@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) &&
(!Available[IDFetch].empty() || !Available[IDOther].empty());
+ if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
+ // We use the heuristic provided by AMD Accelerated Parallel Processing
+ // OpenCL Programming Guide :
+ // The approx. number of WF that allows TEX inst to hide ALU inst is :
+ // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU))
+ float ALUFetchRationEstimate =
+ (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) /
+ (FetchInstCount + Available[IDFetch].size());
+ unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
+ DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" );
+ // We assume the local GPR requirements to be "dominated" by the requirement
+ // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and
+ // after TEX are indeed likely to consume or generate values from/for the
+ // TEX clause.
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause
+ // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need
+ // one GPR) or TmXYZW = TnXYZW (need 2 GPR).
+ // (TODO : use RegisterPressure)
+ // If we are going to use too many GPRs, we flush Fetch instructions to lower
+ // register pressure on 128 bits regs.
+ unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
+ if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
+ AllowSwitchFromAlu = true;
+ }
+
+
// We want to schedule AR defs as soon as possible to make sure they aren't
// put in a different ALU clause from their uses.
if (!SU && !UnscheduledARDefs.empty()) {
@@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
}
if (CurInstKind == IDAlu) {
+ AluInstCount ++;
switch (getAluKind(SU)) {
case AluT_XYZW:
CurEmitted += 4;
@@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
if (CurInstKind != IDFetch) {
MoveUnits(Pending[IDFetch], Available[IDFetch]);
- }
+ } else
+ FetchInstCount++;
}
static bool
@@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
return UnslotedSU;
}
-bool R600SchedStrategy::isAvailablesAluEmpty() const {
- return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
- AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
- AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
- AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() &&
- AvailableAlus[AluPredX].empty();
+unsigned R600SchedStrategy::AvailablesAluCount() const {
+ return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
+ AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
+ AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
+ AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size();
}
SUnit* R600SchedStrategy::pickAlu() {
- while (!isAvailablesAluEmpty()) {
+ while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
if (!OccupedSlotsMask) {
// Bottom up scheduling : predX must come first
if (!AvailableAlus[AluPredX].empty()) {