From d9b35435b89015d154b0e20f4d4796d936237f84 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 19 Jun 2014 01:19:19 +0000 Subject: R600/SI: Add intrinsics for various math instructions. These will be used for custom lowering and for library implementations of various math functions, so it's useful to expose these as builtins. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211247 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/R600/AMDGPUISelLowering.cpp | 32 +++++++++++++++++++-- lib/Target/R600/AMDGPUISelLowering.h | 12 ++++++++ lib/Target/R600/AMDGPUInstrInfo.td | 29 +++++++++++++++++++ lib/Target/R600/AMDGPUInstructions.td | 10 +++++++ lib/Target/R600/AMDGPUIntrinsics.td | 2 -- lib/Target/R600/R600Instructions.td | 2 +- lib/Target/R600/SIInsertWaits.cpp | 2 ++ lib/Target/R600/SIInstructions.td | 37 +++++++++++++++++++------ lib/Transforms/InstCombine/InstCombineCalls.cpp | 14 ++++++++++ 9 files changed, 127 insertions(+), 13 deletions(-) (limited to 'lib') diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 34c2b2bf61..1aa92fadbe 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -842,6 +842,28 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::CLAMP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::AMDGPU_div_scale: + return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_div_fmas: + return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_div_fixup: + return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::AMDGPU_trig_preop: + return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, + Op.getOperand(1), Op.getOperand(2)); + + case Intrinsic::AMDGPU_rcp: + return 
DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); + + case Intrinsic::AMDGPU_rsq: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_imax: return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), Op.getOperand(2)); @@ -2042,6 +2064,14 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(FMIN) NODE_NAME_CASE(SMIN) NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(DIV_SCALE) + NODE_NAME_CASE(DIV_FMAS) + NODE_NAME_CASE(DIV_FIXUP) + NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) + NODE_NAME_CASE(RSQ) + NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) NODE_NAME_CASE(BFI) @@ -2051,8 +2081,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(MUL_I24) NODE_NAME_CASE(MAD_U24) NODE_NAME_CASE(MAD_I24) - NODE_NAME_CASE(URECIP) - NODE_NAME_CASE(DOT4) NODE_NAME_CASE(EXPORT) NODE_NAME_CASE(CONST_ADDRESS) NODE_NAME_CASE(REGISTER_LOAD) diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index b2bb2579dc..e2000a04ba 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -175,6 +175,9 @@ enum { DWORDADDR, FRACT, CLAMP, + + // SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi. + // Denormals handled on some parts. COS_HW, SIN_HW, FMAX, @@ -184,6 +187,15 @@ enum { SMIN, UMIN, URECIP, + DIV_SCALE, + DIV_FMAS, + DIV_FIXUP, + TRIG_PREOP, // 1 ULP max error for f64 + + // RCP, RSQ - For f32, 1 ULP max error, no denormal handling. + // For f64, max error 2^29 ULP, handles denormals. + RCP, + RSQ, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. 
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td index 942a9e8ff3..d0ee40a678 100644 --- a/lib/Target/R600/AMDGPUInstrInfo.td +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> ]>; +def AMDGPUTrigPreOp : SDTypeProfile<1, 2, + [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>] +>; + +def AMDGPUDivScaleOp : SDTypeProfile<2, 3, + [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>] +>; + //===----------------------------------------------------------------------===// // AMDGPU DAG Nodes // @@ -29,6 +37,12 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; // out = a - floor(a) def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + +// out = 1.0 / sqrt(a) +def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>; + // out = max(a, b) a and b are floats def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] @@ -78,6 +92,21 @@ def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3", // e is rounding error def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; +// Special case divide preop and flags. +def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>; + +// Special case divide FMA with scale and flags (src0 = Quotient, +// src1 = Denominator, src2 = Numerator). +def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>; + +// Single or double precision division fixup. +// Special case divide fixup and flags(src0 = Quotient, src1 = +// Denominator, src2 = Numerator). 
+def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>; + +// Look Up 2.0 / pi src0 with segment select src1[4:0] +def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>; + def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD", SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>, [SDNPHasChain, SDNPMayLoad]>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index 8bfc11cd46..14bfd8cc18 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -519,6 +519,16 @@ multiclass Expand24UBitOps { >; } +class RcpPat : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td index 6dc7612d46..538b4cd8af 100644 --- a/lib/Target/R600/AMDGPUIntrinsics.td +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -30,8 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in { def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 58c704d8ec..47b7da0955 100644 
--- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1083,7 +1083,7 @@ class RECIP_UINT_Common inst> : R600_1OP_Helper < } class RECIPSQRT_CLAMPED_Common inst> : R600_1OP_Helper < - inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq + inst, "RECIPSQRT_CLAMPED", AMDGPUrsq > { let Itinerary = TransALU; } diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp index a17fed7e7e..173332674f 100644 --- a/lib/Target/R600/SIInsertWaits.cpp +++ b/lib/Target/R600/SIInsertWaits.cpp @@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) { return Result; } +// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States" +// around other non-memory instructions. bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) { bool Changes = false; diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 60eb8f9781..26024dc4f2 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1116,22 +1116,23 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", [(set f32:$dst, (flog2 f32:$src0))] >; + defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + [(set f32:$dst, (AMDGPUrcp f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; defm V_RSQ_LEGACY_F32 : VOP1_32 < 0x0000002d, "V_RSQ_LEGACY_F32", - [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))] + [(set f32:$dst, (AMDGPUrsq f32:$src0))] >; defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] >; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] + [(set 
f64:$dst, (AMDGPUrcp f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", @@ -1417,8 +1418,12 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; -defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; -def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", + [(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", + [(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))] +>; def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64", [(set i64:$dst, (shl i64:$src0, i32:$src1))] @@ -1452,12 +1457,19 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; -defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; -def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; + +defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", + [(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))] +>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", + [(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))] +>; //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; -def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64", + [(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))] +>; 
//===----------------------------------------------------------------------===// // Pseudo Instructions @@ -1748,6 +1760,15 @@ def : Pat < (S_BARRIER) >; +//===----------------------------------------------------------------------===// +// VOP1 Patterns +//===----------------------------------------------------------------------===// + +def : RcpPat; +def : RcpPat; +def : RsqPat; +def : RsqPat; + //===----------------------------------------------------------------------===// // VOP2 Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index d4bdd75fa8..ff74564158 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -922,6 +922,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::AMDGPU_rcp: { + if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) { + const APFloat &ArgVal = C->getValueAPF(); + APFloat Val(ArgVal.getSemantics(), 1.0); + APFloat::opStatus Status = Val.divide(ArgVal, + APFloat::rmNearestTiesToEven); + // Only do this if it was exact and therefore not dependent on the + // rounding mode. + if (Status == APFloat::opOK) + return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val)); + } + + break; + } case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. -- cgit v1.2.3