diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2014-01-23 16:18:02 +0000 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2014-01-23 16:18:02 +0000 |
commit | 7d3b9d96b6405c1a0b6d1ae73d5a4a1d506d500d (patch) | |
tree | 6b2c2f1ebc797eca2bc9921c0ac47e59d83568bd | |
parent | 03b94a460d9cfd86e438cd88596d93c63fc3c3e0 (diff) | |
download | llvm-7d3b9d96b6405c1a0b6d1ae73d5a4a1d506d500d.tar.gz llvm-7d3b9d96b6405c1a0b6d1ae73d5a4a1d506d500d.tar.bz2 llvm-7d3b9d96b6405c1a0b6d1ae73d5a4a1d506d500d.tar.xz |
R600: Recommit 199842: Add work-around for the CF stack entry HW bug
The unit test is now disabled on non-asserts builds.
The CF stack can be corrupted if you use CF_ALU_PUSH_BEFORE,
CF_ALU_ELSE_AFTER, CF_ALU_BREAK, or CF_ALU_CONTINUE when the number of
sub-entries on the stack is greater than or equal to the stack entry
size and sub-entries modulo 4 is either 0 or 3 (on cedar the bug is
present when number of sub-entries module 8 is either 7 or 0)
We choose to be conservative and always apply the work-around when the
number of sub-enries is greater than or equal to the stack entry size,
so that we can safely over-allocate the stack when we are unsure of the
stack allocation rules.
reviewed-by: Vincent Lejeune <vljn at ovi.com>
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199905 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/R600/AMDGPU.td | 5 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUSubtarget.cpp | 6 | ||||
-rw-r--r-- | lib/Target/R600/AMDGPUSubtarget.h | 2 | ||||
-rw-r--r-- | lib/Target/R600/Processors.td | 14 | ||||
-rw-r--r-- | lib/Target/R600/R600ControlFlowFinalizer.cpp | 43 | ||||
-rw-r--r-- | test/CodeGen/R600/cf-stack-bug.ll | 227 |
6 files changed, 290 insertions, 7 deletions
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index c4e5efc8d6..d1e2cf5319 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA", "true", "Use Cayman ISA">; +def FeatureCFALUBug : SubtargetFeature<"cfalubug", + "CFALUBug", + "true", + "GPU has CF_ALU bug">; + class SubtargetFeatureFetchLimit <string Value> : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index f36aa2071c..e77ab5e6d1 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : EnableIRStructurizer = true; EnableIfCvt = true; WavefrontSize = 0; + CFALUBug = false; ParseSubtargetFeatures(GPU, FS); DevName = GPU; } @@ -97,6 +98,11 @@ AMDGPUSubtarget::getStackEntrySize() const { } } bool +AMDGPUSubtarget::hasCFAluBug() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + return CFALUBug; +} +bool AMDGPUSubtarget::isTargetELF() const { return false; } diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 68d853218b..7e7f4d0c00 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -52,6 +52,7 @@ private: bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; + bool CFALUBug; InstrItineraryData InstrItins; @@ -71,6 +72,7 @@ public: bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; + bool hasCFAluBug() const; virtual bool enableMachineScheduler() const { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index e601f35316..fde4481497 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -46,13 +46,15 @@ def : Proc<"rv770", R600_VLIW5_Itin, //===----------------------------------------------------------------------===// def : Proc<"cedar", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32, + FeatureCFALUBug]>; def : Proc<"redwood", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; + [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64, + FeatureCFALUBug]>; def : Proc<"sumo", R600_VLIW5_Itin, - [FeatureEvergreen, FeatureWavefrontSize64]>; + [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>; def : Proc<"juniper", R600_VLIW5_Itin, [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>; @@ -66,13 +68,13 @@ def : Proc<"cypress", R600_VLIW5_Itin, //===----------------------------------------------------------------------===// def : Proc<"barts", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache]>; + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; def : Proc<"turks", R600_VLIW5_Itin, - [FeatureNorthernIslands, FeatureVertexCache]>; + [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>; def : Proc<"caicos", R600_VLIW5_Itin, - [FeatureNorthernIslands]>; + [FeatureNorthernIslands, FeatureCFALUBug]>; def : Proc<"cayman", R600_VLIW4_Itin, [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 6b42a7a9fa..470ff2e107 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -73,6 +73,44 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) { return false; } +bool CFStack::requiresWorkAroundForInst(unsigned Opcode) { + if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() && + getLoopDepth() > 1) + return true; + + if (!ST.hasCFAluBug()) + return false; + + switch(Opcode) { + default: return false; + case AMDGPU::CF_ALU_PUSH_BEFORE: + case AMDGPU::CF_ALU_ELSE_AFTER: + case AMDGPU::CF_ALU_BREAK: + case AMDGPU::CF_ALU_CONTINUE: + if (CurrentSubEntries == 0) + return false; + if (ST.getWavefrontSize() == 64) { + // We are being conservative here. We only require this work-around if + // CurrentSubEntries > 3 && + // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0) + // + // We have to be conservative, because we don't know for certain that + // our stack allocation algorithm for Evergreen/NI is correct. Applying this + // work-around when CurrentSubEntries > 3 allows us to over-allocate stack + // resources without any problems. + return CurrentSubEntries > 3; + } else { + assert(ST.getWavefrontSize() == 32); + // We are being conservative here. We only require the work-around if + // CurrentSubEntries > 7 && + // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0) + // See the comment on the wavefront size == 64 case for why we are + // being conservative. + return CurrentSubEntries > 7; + } + } +} + unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { switch(Item) { default: @@ -472,9 +510,12 @@ public: if (MI->getOpcode() == AMDGPU::CF_ALU) LastAlu.back() = MI; I++; + bool RequiresWorkAround = + CFStack.requiresWorkAroundForInst(MI->getOpcode()); switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: - if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) { + if (RequiresWorkAround) { + DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n"); BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll new file mode 100644 index 0000000000..c3a4612e6a --- /dev/null +++ b/test/CodeGen/R600/cf-stack-bug.ll @@ -0,0 +1,227 @@ +; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC +; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC + +; REQUIRES: asserts + +; We are currently allocating 2 extra sub-entries on Evergreen / NI for +; non-WQM push instructions if we change this to 1, then we will need to +; add one level of depth to each of these tests. + +; BUG64-NOT: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested3 +define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.store.1 + +if.store.1: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + store i32 3, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested4 +define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + store i32 4, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32-NOT: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested7 +define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + store i32 7, i32 addrspace(1)* %out + br label %end + +end: + ret void +} + +; BUG64: Applying bug work-around +; BUG32: Applying bug work-around +; NOBUG-NOT: Applying bug work-around +; FUNC-LABEL: @nested8 +define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +entry: + %0 = icmp sgt i32 %cond, 0 + br i1 %0, label %if.1, label %end + +if.1: + %1 = icmp sgt i32 %cond, 10 + br i1 %1, label %if.2, label %if.1.store + +if.1.store: + store i32 1, i32 addrspace(1)* %out + br label %end + +if.2: + %2 = icmp sgt i32 %cond, 20 + br i1 %2, label %if.3, label %if.2.store + +if.2.store: + store i32 2, i32 addrspace(1)* %out + br label %end + +if.3: + %3 = icmp sgt i32 %cond, 30 + br i1 %3, label %if.4, label %if.3.store + +if.3.store: + store i32 3, i32 addrspace(1)* %out + br label %end + +if.4: + %4 = icmp sgt i32 %cond, 40 + br i1 %4, label %if.5, label %if.4.store + +if.4.store: + store i32 4, i32 addrspace(1)* %out + br label %end + +if.5: + %5 = icmp sgt i32 %cond, 50 + br i1 %5, label %if.6, label %if.5.store + +if.5.store: + store i32 5, i32 addrspace(1)* %out + br label %end + +if.6: + %6 = icmp sgt i32 %cond, 60 + br i1 %6, label %if.7, label %if.6.store + +if.6.store: + store i32 6, i32 addrspace(1)* %out + br label %end + +if.7: + %7 = icmp sgt i32 %cond, 70 + br i1 %7, label %if.8, label %if.7.store + +if.7.store: + store i32 7, i32 addrspace(1)* %out + br label %end + +if.8: + store i32 8, i32 addrspace(1)* %out + br label %end + +end: + ret void +} |