summaryrefslogtreecommitdiff
path: root/lib/Target
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2014-06-24 22:13:39 +0000
committerMatt Arsenault <Matthew.Arsenault@amd.com>2014-06-24 22:13:39 +0000
commit95eb45c5d94bfe9360ffc021697a50c6cf8c08cd (patch)
treeccff569afcf67b58b968ce88f204ad9641431949 /lib/Target
parent0029534141ee83d827267d4ac9418bbed5704c2e (diff)
downloadllvm-95eb45c5d94bfe9360ffc021697a50c6cf8c08cd.tar.gz
llvm-95eb45c5d94bfe9360ffc021697a50c6cf8c08cd.tar.bz2
llvm-95eb45c5d94bfe9360ffc021697a50c6cf8c08cd.tar.xz
R600: Fix inconsistency in rsq instructions.
R600 was using a clamped version of rsq, but SI was not. Add a new rsq_clamped intrinsic and use them consistently. It's unclear to me from the documentation what behavior the R600 instructions have, so I assume they have the legacy behavior described by the SI documents. For R600, use RECIPSQRT_IEEE for both llvm.AMDGPU.rsq.legacy and llvm.AMDGPU.rsq. R600 also has RECIPSQRT_FF, which I'm not sure how it fits in here. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211637 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.cpp8
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h2
-rw-r--r--lib/Target/R600/AMDGPUInstrInfo.td6
-rw-r--r--lib/Target/R600/AMDGPUInstructions.td15
-rw-r--r--lib/Target/R600/AMDGPUIntrinsics.td8
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp3
-rw-r--r--lib/Target/R600/R600Instructions.td9
-rw-r--r--lib/Target/R600/SIInstructions.td18
8 files changed, 55 insertions, 14 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 6b70d4c010..f467436a3a 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -805,6 +805,12 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::AMDGPU_rsq:
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
+ case AMDGPUIntrinsic::AMDGPU_legacy_rsq:
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+
+ case Intrinsic::AMDGPU_rsq_clamped:
+ return DAG.getNode(AMDGPUISD::RSQ_CLAMPED, DL, VT, Op.getOperand(1));
+
case AMDGPUIntrinsic::AMDGPU_imax:
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
Op.getOperand(2));
@@ -2052,6 +2058,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
+ NODE_NAME_CASE(RSQ_LEGACY)
+ NODE_NAME_CASE(RSQ_CLAMPED)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(BFE_U32)
NODE_NAME_CASE(BFE_I32)
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 7d9dcc9ad2..307921e65d 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -192,6 +192,8 @@ enum {
// For f64, max error 2^29 ULP, handles denormals.
RCP,
RSQ,
+ RSQ_LEGACY,
+ RSQ_CLAMPED,
DOT4,
BFE_U32, // Extract range of bits with zero extension to 32-bits.
BFE_I32, // Extract range of bits with sign extension to 32-bits.
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index d0ee40a678..934d59d126 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -43,6 +43,12 @@ def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+// out = 1.0 / sqrt(a)
+def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+
+// out = 1.0 / sqrt(a) result clamped to +/- max_float.
+def AMDGPUrsq_clamped : SDNode<"AMDGPUISD::RSQ_CLAMPED", SDTFPUnaryOp>;
+
// out = max(a, b) a and b are floats
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 14bfd8cc18..b86b7818fc 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -524,10 +524,17 @@ class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
(RcpInst $src)
>;
-class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
- (AMDGPUrcp (fsqrt vt:$src)),
- (RsqInst $src)
->;
+multiclass RsqPat<Instruction RsqInst, ValueType vt> {
+ def : Pat <
+ (fdiv FP_ONE, (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+
+ def : Pat <
+ (AMDGPUrcp (fsqrt vt:$src)),
+ (RsqInst $src)
+ >;
+}
include "R600Instructions.td"
include "R700Instructions.td"
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index 27c0dbea09..d934676bbf 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -24,6 +24,14 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
def int_AMDGPU_fract : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>;
def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+ // This is named backwards (instead of rsq_legacy) so we don't have
+ // to define it with the public builtins intrinsics. This is a
+ // workaround for how intrinsic names are parsed. If the name is
+ // llvm.AMDGPU.rsq.legacy, the parser assumes that you meant
+ // llvm.AMDGPU.rsq.{f32 | f64} and incorrectly mangled the name.
+ def int_AMDGPU_legacy_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
+
def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
def int_AMDGPU_kilp : Intrinsic<[], [], []>;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 13d555e7e1..996117c4bd 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -814,6 +814,9 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
case Intrinsic::r600_read_tidig_z:
return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
AMDGPU::T0_Z, VT);
+ case Intrinsic::AMDGPU_rsq:
+ // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
+ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
}
// break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
break;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 616b3b533b..73fa345ac0 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1079,18 +1079,21 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
let Itinerary = TransALU;
}
+// Clamped to maximum.
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "RECIPSQRT_CLAMPED", AMDGPUrsq
+ inst, "RECIPSQRT_CLAMPED", AMDGPUrsq_clamped
> {
let Itinerary = TransALU;
}
-class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP <
- inst, "RECIPSQRT_IEEE", []
+class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
+ inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy
> {
let Itinerary = TransALU;
}
+// TODO: There is also RECIPSQRT_FF which clamps to zero.
+
class SIN_Common <bits<11> inst> : R600_1OP <
inst, "SIN", [(set f32:$dst, (SIN_HW f32:$src0))]>{
let Trig = 1;
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 72c51a11fa..507cfe8e81 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -1123,22 +1123,26 @@ defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
[(set f32:$dst, (AMDGPUrcp f32:$src0))]
>;
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
-defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
+defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32",
+ [(set f32:$dst, (AMDGPUrsq_clamped f32:$src0))]
+>;
defm V_RSQ_LEGACY_F32 : VOP1_32 <
0x0000002d, "V_RSQ_LEGACY_F32",
- [(set f32:$dst, (AMDGPUrsq f32:$src0))]
+ [(set f32:$dst, (AMDGPUrsq_legacy f32:$src0))]
>;
defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
- [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))]
+ [(set f32:$dst, (AMDGPUrsq f32:$src0))]
>;
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
[(set f64:$dst, (AMDGPUrcp f64:$src0))]
>;
defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
- [(set f64:$dst, (fdiv FP_ONE, (fsqrt f64:$src0)))]
+ [(set f64:$dst, (AMDGPUrsq f64:$src0))]
+>;
+defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64",
+ [(set f64:$dst, (AMDGPUrsq_clamped f64:$src0))]
>;
-defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>;
defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32",
[(set f32:$dst, (fsqrt f32:$src0))]
>;
@@ -1781,8 +1785,8 @@ def : Pat <
def : RcpPat<V_RCP_F32_e32, f32>;
def : RcpPat<V_RCP_F64_e32, f64>;
-def : RsqPat<V_RSQ_F32_e32, f32>;
-def : RsqPat<V_RSQ_F64_e32, f64>;
+defm : RsqPat<V_RSQ_F32_e32, f32>;
+defm : RsqPat<V_RSQ_F64_e32, f64>;
//===----------------------------------------------------------------------===//
// VOP2 Patterns