diff options
author | Vincent Lejeune <vljn@ovi.com> | 2013-06-07 23:30:34 +0000 |
---|---|---|
committer | Vincent Lejeune <vljn@ovi.com> | 2013-06-07 23:30:34 +0000 |
commit | 843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4 (patch) | |
tree | 5c1348e877b36cf4695b7d7d50c3e01edca25996 /lib/Target/R600/R600MachineScheduler.cpp | |
parent | b01bdf87ff5e13eb22fcc20cd395bf282fbf1ecd (diff) | |
download | llvm-843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4.tar.gz llvm-843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4.tar.bz2 llvm-843c6c2d0e83bcd52a215d768bacaa7b5ffe16a4.tar.xz |
R600: Use a refined heuristic to choose when switching clause
This is using a hint from AMD APP OpenCL Programming Guide with
empirically tweaked parameters.
I used Unigine Heaven 3.0 to determine the best parameters on my system
(i7 2600/Radeon 6950/Kernel 3.9.4). On this benchmark,
the average went from 38.8 fps to 39.6 fps, which is a ~3% gain.
(Lightmark 2008.2 gain is much more marginal: from 537 to 539)
There is no lit test provided, as the parameters were determined
empirically and it would be nearly impossible to find a test
program that checks for optimal behavior.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@183593 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600/R600MachineScheduler.cpp')
-rw-r--r-- | lib/Target/R600/R600MachineScheduler.cpp | 52 |
1 files changed, 43 insertions, 9 deletions
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 29f3e1a7d5..a330d88574 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -38,7 +38,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>(); InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); - + AluInstCount = 0; + FetchInstCount = 0; } void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, @@ -48,6 +49,12 @@ void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc, QSrc.clear(); } +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + assert (GPRCount && "GPRCount cannot be 0"); + return 248 / GPRCount; +} + SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { SUnit *SU = 0; NextInstKind = IDOther; @@ -60,6 +67,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && (!Available[IDFetch].empty() || !Available[IDOther].empty()); + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. 
+ // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going too use too many GPR, we flush Fetch instruction to lower + // register pressure on 128 bits regs. + unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + + // We want to scheduled AR defs as soon as possible to make sure they aren't // put in a different ALU clause from their uses. if (!SU && !UnscheduledARDefs.empty()) { @@ -133,6 +166,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { } if (CurInstKind == IDAlu) { + AluInstCount ++; switch (getAluKind(SU)) { case AluT_XYZW: CurEmitted += 4; @@ -158,7 +192,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (CurInstKind != IDFetch) { MoveUnits(Pending[IDFetch], Available[IDFetch]); - } + } else + FetchInstCount++; } static bool @@ -370,16 +405,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) { return UnslotedSU; } -bool R600SchedStrategy::isAvailablesAluEmpty() const { - return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() && - AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() && - AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() && - AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() && - AvailableAlus[AluPredX].empty(); +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size(); } SUnit* R600SchedStrategy::pickAlu() { - while (!isAvailablesAluEmpty()) { + while (AvailablesAluCount() 
|| !Pending[IDAlu].empty()) { if (!OccupedSlotsMask) { // Bottom up scheduling : predX must comes first if (!AvailableAlus[AluPredX].empty()) { |