From 3cd8cf6bbd76c7c52a2e7075579c069542f61696 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 26 Jun 2014 17:22:30 +0000 Subject: R600/SI: Add FP mode bits to binary. The default rounding mode to initialize the mode register needs to be reported to the runtime. Fill in other bits a kernel may be interested in setting for future use. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211791 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 69 ++++++++++++++++++++++++++++++------ lib/Target/R600/AMDGPUAsmPrinter.h | 23 +++++++++--- lib/Target/R600/SIDefines.h | 50 ++++++++++++++++++++++++++ test/CodeGen/R600/default-fp-mode.ll | 10 ++++++ 4 files changed, 138 insertions(+), 14 deletions(-) create mode 100644 test/CodeGen/R600/default-fp-mode.ll diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 2da7792a88..a6e217b969 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -36,6 +36,24 @@ using namespace llvm; +// TODO: This should get the default rounding mode from the kernel. We just set +// the default here, but this could change if the OpenCL rounding mode pragmas +// are used. +// +// The denormal mode here should match what is reported by the OpenCL runtime +// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but +// can also be override to flush with the -cl-denorms-are-zero compiler flag. +// +// AMD OpenCL only sets flush none and reports CL_FP_DENORM for double +// precision, and leaves single precision to flush all and does not report +// CL_FP_DENORM for CL_DEVICE_SINGLE_FP_CONFIG. Mesa's OpenCL currently reports +// CL_FP_DENORM for both. +static uint32_t getFPMode(MachineFunction &) { + return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) | + FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) | + FP_DENORM_MODE_SP(FP_DENORM_FLUSH_NONE) | + FP_DENORM_MODE_DP(FP_DENORM_FLUSH_NONE); +} static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, MCStreamer &Streamer) { @@ -93,6 +111,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { false); OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), false); + OutStreamer.emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), + false); + OutStreamer.emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo(); OutStreamer.emitRawComment( @@ -280,16 +302,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, if (VCCUsed) MaxSGPR += 2; - ProgInfo.CodeLen = CodeSize; - ProgInfo.NumSGPR = MaxSGPR; ProgInfo.NumVGPR = MaxVGPR; + ProgInfo.NumSGPR = MaxSGPR; + + // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode + // register. + ProgInfo.FloatMode = getFPMode(MF); + + // XXX: Not quite sure what this does, but sc seems to unset this. + ProgInfo.IEEEMode = 0; + + // Do not clamp NAN to 0. + ProgInfo.DX10Clamp = 0; + + ProgInfo.CodeLen = CodeSize; } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = TM.getSubtarget(); - SIMachineFunctionInfo *MFI = MF.getInfo(); + unsigned RsrcReg; switch (MFI->ShaderType) { default: // Fall through @@ -299,25 +332,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, case ShaderType::VERTEX: RsrcReg = R_00B128_SPI_SHADER_PGM_RSRC1_VS; break; } - OutStreamer.EmitIntValue(RsrcReg, 4); - OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | - S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); - unsigned LDSAlignShift; if (STM.getGeneration() < AMDGPUSubtarget::SEA_ISLANDS) { - // LDS is allocated in 64 dword blocks + // LDS is allocated in 64 dword blocks. LDSAlignShift = 8; } else { - // LDS is allocated in 128 dword blocks + // LDS is allocated in 128 dword blocks. LDSAlignShift = 9; } + unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; if (MFI->ShaderType == ShaderType::COMPUTE) { + OutStreamer.EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); + + const uint32_t ComputePGMRSrc1 = + S_00B848_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B848_SGPRS(KernelInfo.NumSGPR / 8) | + S_00B848_PRIORITY(KernelInfo.Priority) | + S_00B848_FLOAT_MODE(KernelInfo.FloatMode) | + S_00B848_PRIV(KernelInfo.Priv) | + S_00B848_DX10_CLAMP(KernelInfo.DX10Clamp) | + S_00B848_IEEE_MODE(KernelInfo.DebugMode) | + S_00B848_IEEE_MODE(KernelInfo.IEEEMode); + + OutStreamer.EmitIntValue(ComputePGMRSrc1, 4); + OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + } else { + OutStreamer.EmitIntValue(RsrcReg, 4); + OutStreamer.EmitIntValue(S_00B028_VGPRS(KernelInfo.NumVGPR / 4) | + S_00B028_SGPRS(KernelInfo.NumSGPR / 8), 4); } + if (MFI->ShaderType == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); OutStreamer.EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(LDSBlocks), 4); diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h index 71adc9a4d1..c1acb6ea0f 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.h +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -25,13 +25,28 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { SIProgramInfo() : - CodeLen(0), + NumVGPR(0), NumSGPR(0), - NumVGPR(0) {} + Priority(0), + FloatMode(0), + Priv(0), + DX10Clamp(0), + DebugMode(0), + IEEEMode(0), + CodeLen(0) {} + // Fields set in PGM_RSRC1 pm4 packet. + uint32_t NumVGPR; + uint32_t NumSGPR; + uint32_t Priority; + uint32_t FloatMode; + uint32_t Priv; + uint32_t DX10Clamp; + uint32_t DebugMode; + uint32_t IEEEMode; + + // Bonus information for debugging. uint64_t CodeLen; - unsigned NumSGPR; - unsigned NumVGPR; }; void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; diff --git a/lib/Target/R600/SIDefines.h b/lib/Target/R600/SIDefines.h index 2cbce282cb..4d31a11859 100644 --- a/lib/Target/R600/SIDefines.h +++ b/lib/Target/R600/SIDefines.h @@ -35,4 +35,54 @@ enum { #define S_00B84C_LDS_SIZE(x) (((x) & 0x1FF) << 15) #define R_0286CC_SPI_PS_INPUT_ENA 0x0286CC + +#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 +#define S_00B848_VGPRS(x) (((x) & 0x3F) << 0) +#define G_00B848_VGPRS(x) (((x) >> 0) & 0x3F) +#define C_00B848_VGPRS 0xFFFFFFC0 +#define S_00B848_SGPRS(x) (((x) & 0x0F) << 6) +#define G_00B848_SGPRS(x) (((x) >> 6) & 0x0F) +#define C_00B848_SGPRS 0xFFFFFC3F +#define S_00B848_PRIORITY(x) (((x) & 0x03) << 10) +#define G_00B848_PRIORITY(x) (((x) >> 10) & 0x03) +#define C_00B848_PRIORITY 0xFFFFF3FF +#define S_00B848_FLOAT_MODE(x) (((x) & 0xFF) << 12) +#define G_00B848_FLOAT_MODE(x) (((x) >> 12) & 0xFF) +#define C_00B848_FLOAT_MODE 0xFFF00FFF +#define S_00B848_PRIV(x) (((x) & 0x1) << 20) +#define G_00B848_PRIV(x) (((x) >> 20) & 0x1) +#define C_00B848_PRIV 0xFFEFFFFF +#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21) +#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1) +#define C_00B848_DX10_CLAMP 0xFFDFFFFF +#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22) +#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1) +#define C_00B848_DEBUG_MODE 0xFFBFFFFF +#define S_00B848_IEEE_MODE(x) (((x) & 0x1) << 23) +#define G_00B848_IEEE_MODE(x) (((x) >> 23) & 0x1) +#define C_00B848_IEEE_MODE 0xFF7FFFFF + + +// Helpers for setting FLOAT_MODE +#define FP_ROUND_ROUND_TO_NEAREST 0 +#define FP_ROUND_ROUND_TO_INF 1 +#define FP_ROUND_ROUND_TO_NEGINF 2 +#define FP_ROUND_ROUND_TO_ZERO 3 + +// Bits 3:0 control rounding mode. 1:0 control single precision, 3:2 double +// precision. +#define FP_ROUND_MODE_SP(x) ((x) & 0x3) +#define FP_ROUND_MODE_DP(x) (((x) & 0x3) << 2) + +#define FP_DENORM_FLUSH_IN_FLUSH_OUT 0 +#define FP_DENORM_FLUSH_OUT 1 +#define FP_DENORM_FLUSH_IN 2 +#define FP_DENORM_FLUSH_NONE 3 + + +// Bits 7:4 control denormal handling. 5:4 control single precision, 6:7 double +// precision. +#define FP_DENORM_MODE_SP(x) (((x) & 0x3) << 4) +#define FP_DENORM_MODE_DP(x) (((x) & 0x3) << 6) + #endif // SIDEFINES_H_ diff --git a/test/CodeGen/R600/default-fp-mode.ll b/test/CodeGen/R600/default-fp-mode.ll new file mode 100644 index 0000000000..214b2c2036 --- /dev/null +++ b/test/CodeGen/R600/default-fp-mode.ll @@ -0,0 +1,10 @@ +; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: @test_kernel +; SI: FloatMode: 240 +; SI: IeeeMode: 0 +define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} -- cgit v1.2.3