From 3a880de6e613beae380255d0812a299bd9552759 Mon Sep 17 00:00:00 2001 From: Elena Demikhovsky Date: Sun, 5 Jan 2014 10:46:09 +0000 Subject: AVX-512: Added more intrinsics for convert and min/max. Removed vzeroupper from AVX-512 mode - our optimization guide does not recommend inserting vzeroupper at all. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@198557 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IntrinsicsX86.td | 32 ++++++++++------- lib/Target/X86/X86ISelLowering.cpp | 10 +----- lib/Target/X86/X86InstrAVX512.td | 50 +++++++++++++++++++------- lib/Target/X86/X86VZeroUpper.cpp | 23 ++++-------- test/CodeGen/X86/avx512-intrinsics.ll | 68 ++++++++++++++++++++++++++++++++++- 5 files changed, 132 insertions(+), 51 deletions(-) diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td index 8a208d13a0..4b37fa327a 100644 --- a/include/llvm/IR/IntrinsicsX86.td +++ b/include/llvm/IR/IntrinsicsX86.td @@ -2747,7 +2747,13 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtdq2pd_512 : GCCBuiltin<"__builtin_ia32_cvtdq2pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_v8f64_ty, - llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + llvm_i8_ty], [IntrNoMem]>; + def int_x86_avx512_mask_cvtudq2ps_512 : GCCBuiltin<"__builtin_ia32_cvtudq2ps512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, llvm_v16f32_ty, + llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_cvtudq2pd_512 : GCCBuiltin<"__builtin_ia32_cvtudq2pd512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_v8f64_ty, + llvm_i8_ty], [IntrNoMem]>; } // Vector load with broadcast @@ -2800,18 +2806,18 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
- def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, - llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, - llvm_v8f64_ty], [IntrNoMem]>; - def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">, - Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, - llvm_v16f32_ty], [IntrNoMem]>; - def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">, - Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, - llvm_v8f64_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, + llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, + llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">, + Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, + llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">, + Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, + llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index f4a2f69218..d4af1eb7a9 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -11462,14 +11462,10 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_avx512_max_ps_512: - case 
Intrinsic::x86_avx512_max_pd_512: case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: - case Intrinsic::x86_avx_min_pd_256: - case Intrinsic::x86_avx512_min_ps_512: - case Intrinsic::x86_avx512_min_pd_512: { + case Intrinsic::x86_avx_min_pd_256: { unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. @@ -11477,16 +11473,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { case Intrinsic::x86_sse2_max_pd: case Intrinsic::x86_avx_max_ps_256: case Intrinsic::x86_avx_max_pd_256: - case Intrinsic::x86_avx512_max_ps_512: - case Intrinsic::x86_avx512_max_pd_512: Opcode = X86ISD::FMAX; break; case Intrinsic::x86_sse_min_ps: case Intrinsic::x86_sse2_min_pd: case Intrinsic::x86_avx_min_ps_256: case Intrinsic::x86_avx_min_pd_256: - case Intrinsic::x86_avx512_min_ps_512: - case Intrinsic::x86_avx512_min_pd_512: Opcode = X86ISD::FMIN; break; } diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 6f25272866..8d9ef8ff1a 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2038,6 +2038,25 @@ defm VDIVPDZ : avx512_fp_packed<0x5E, "divpd", fdiv, VR512, v8f64, f512mem, SSE_ALU_ITINS_P.d, 0>, EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; +def : Pat<(v16f32 (int_x86_avx512_mask_max_ps_512 (v16f32 VR512:$src1), + (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), + (i16 -1), FROUND_CURRENT)), + (VMAXPSZrr VR512:$src1, VR512:$src2)>; + +def : Pat<(v8f64 (int_x86_avx512_mask_max_pd_512 (v8f64 VR512:$src1), + (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), + (i8 -1), FROUND_CURRENT)), + (VMAXPDZrr VR512:$src1, VR512:$src2)>; + +def : Pat<(v16f32 (int_x86_avx512_mask_min_ps_512 (v16f32 VR512:$src1), + (v16f32 VR512:$src2), (bc_v16f32 (v16i32 immAllZerosV)), + (i16 -1), FROUND_CURRENT)), + (VMINPSZrr VR512:$src1, VR512:$src2)>; + +def : Pat<(v8f64 (int_x86_avx512_mask_min_pd_512 (v8f64 
VR512:$src1), + (v8f64 VR512:$src2), (bc_v8f64 (v16i32 immAllZerosV)), + (i8 -1), FROUND_CURRENT)), + (VMINPDZrr VR512:$src1, VR512:$src2)>; //===----------------------------------------------------------------------===// // AVX-512 VPTESTM instructions //===----------------------------------------------------------------------===// @@ -2731,7 +2750,7 @@ def : Pat<(extloadf32 addr:$src), def : Pat<(f32 (fround FR64X:$src)), (VCVTSD2SSZrr FR64X:$src, FR64X:$src)>, Requires<[HasAVX512]>; -multiclass avx512_vcvt_fp opc, string asm, RegisterClass SrcRC, +multiclass avx512_vcvt_fp_with_rc opc, string asm, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { @@ -2751,7 +2770,7 @@ let neverHasSideEffects = 1 in { } // neverHasSideEffects = 1 } -multiclass avx512_vcvtt_fp opc, string asm, RegisterClass SrcRC, +multiclass avx512_vcvt_fp opc, string asm, RegisterClass SrcRC, RegisterClass DstRC, SDNode OpNode, PatFrag mem_frag, X86MemOperand x86memop, ValueType OpVT, ValueType InVT, Domain d> { @@ -2768,8 +2787,7 @@ let neverHasSideEffects = 1 in { } // neverHasSideEffects = 1 } - -defm VCVTPD2PSZ : avx512_vcvt_fp<0x5A, "vcvtpd2ps", VR512, VR256X, fround, +defm VCVTPD2PSZ : avx512_vcvt_fp_with_rc<0x5A, "vcvtpd2ps", VR512, VR256X, fround, memopv8f64, f512mem, v8f32, v8f64, SSEPackedSingle>, EVEX_V512, VEX_W, OpSize, EVEX_CD8<64, CD8VF>; @@ -2784,7 +2802,7 @@ def : Pat<(v8f64 (extloadv8f32 addr:$src)), // AVX-512 Vector convert from sign integer to float/double //===----------------------------------------------------------------------===// -defm VCVTDQ2PSZ : avx512_vcvt_fp<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, +defm VCVTDQ2PSZ : avx512_vcvt_fp_with_rc<0x5B, "vcvtdq2ps", VR512, VR512, sint_to_fp, memopv8i64, i512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; @@ -2793,17 +2811,17 @@ defm VCVTDQ2PDZ : avx512_vcvt_fp<0xE6, "vcvtdq2pd", VR256X, VR512, 
sint_to_fp, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; -defm VCVTTPS2DQZ : avx512_vcvtt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, +defm VCVTTPS2DQZ : avx512_vcvt_fp<0x5B, "vcvttps2dq", VR512, VR512, fp_to_sint, memopv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, XS, EVEX_CD8<32, CD8VF>; -defm VCVTTPD2DQZ : avx512_vcvtt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, +defm VCVTTPD2DQZ : avx512_vcvt_fp<0xE6, "vcvttpd2dq", VR512, VR256X, fp_to_sint, memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, OpSize, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTTPS2UDQZ : avx512_vcvtt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, +defm VCVTTPS2UDQZ : avx512_vcvt_fp<0x78, "vcvttps2udq", VR512, VR512, fp_to_uint, memopv16f32, f512mem, v16i32, v16f32, SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>; @@ -2813,7 +2831,7 @@ def : Pat<(v16i32 (int_x86_avx512_mask_cvttps2udq_512 (v16f32 VR512:$src), (v16i32 immAllZerosV), (i16 -1), FROUND_CURRENT)), (VCVTTPS2UDQZrr VR512:$src)>; -defm VCVTTPD2UDQZ : avx512_vcvtt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, +defm VCVTTPD2UDQZ : avx512_vcvt_fp<0x78, "vcvttpd2udq", VR512, VR256X, fp_to_uint, memopv8f64, f512mem, v8i32, v8f64, SSEPackedDouble>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; @@ -2828,7 +2846,7 @@ defm VCVTUDQ2PDZ : avx512_vcvt_fp<0x7A, "vcvtudq2pd", VR256X, VR512, uint_to_fp, SSEPackedDouble>, EVEX_V512, XS, EVEX_CD8<32, CD8VH>; -defm VCVTUDQ2PSZ : avx512_vcvt_fp<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, +defm VCVTUDQ2PSZ : avx512_vcvt_fp_with_rc<0x7A, "vcvtudq2ps", VR512, VR512, uint_to_fp, memopv16i32, f512mem, v16f32, v16i32, SSEPackedSingle>, EVEX_V512, XD, EVEX_CD8<32, CD8VF>; @@ -2839,9 +2857,17 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src), - (v16f32 immAllZerosV), (i16 -1), imm:$rc)), + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), (VCVTDQ2PSZrrb VR512:$src, imm:$rc)>; - +def : 
Pat<(v8f64 (int_x86_avx512_mask_cvtdq2pd_512 (v8i32 VR256X:$src), + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VCVTDQ2PDZrr VR256X:$src)>; +def : Pat<(v16f32 (int_x86_avx512_mask_cvtudq2ps_512 (v16i32 VR512:$src), + (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)), + (VCVTUDQ2PSZrrb VR512:$src, imm:$rc)>; +def : Pat<(v8f64 (int_x86_avx512_mask_cvtudq2pd_512 (v8i32 VR256X:$src), + (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1))), + (VCVTUDQ2PDZrr VR256X:$src)>; multiclass avx512_vcvt_fp2int opc, string asm, RegisterClass SrcRC, RegisterClass DstRC, PatFrag mem_frag, diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp index 66ae9c2d7f..ec5c9da37e 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -17,6 +17,7 @@ #define DEBUG_TYPE "x86-vzeroupper" #include "X86.h" #include "X86InstrInfo.h" +#include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -105,28 +106,20 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() { } static bool isYmmReg(unsigned Reg) { - return (Reg >= X86::YMM0 && Reg <= X86::YMM31); -} - -static bool isZmmReg(unsigned Reg) { - return (Reg >= X86::ZMM0 && Reg <= X86::ZMM31); + return (Reg >= X86::YMM0 && Reg <= X86::YMM15); } static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) { for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I) - if (isYmmReg(I->first) || isZmmReg(I->first)) + if (isYmmReg(I->first)) return true; return false; } static bool clobbersAllYmmRegs(const MachineOperand &MO) { - for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { - if (!MO.clobbersPhysReg(reg)) - return false; - } - for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (!MO.clobbersPhysReg(reg)) return false; } @@ -155,11 +148,7 @@ static bool clobbersAnyYmmReg(MachineInstr 
*MI) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isRegMask()) continue; - for (unsigned reg = X86::YMM0; reg <= X86::YMM31; ++reg) { - if (MO.clobbersPhysReg(reg)) - return true; - } - for (unsigned reg = X86::ZMM0; reg <= X86::ZMM31; ++reg) { + for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) { if (MO.clobbersPhysReg(reg)) return true; } @@ -170,6 +159,8 @@ static bool clobbersAnyYmmReg(MachineInstr *MI) { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// vzero upper instructions before function calls. bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { + if (MF.getTarget().getSubtarget().hasAVX512()) + return false; TII = MF.getTarget().getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); bool EverMadeChange = false; diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index b43c00bf48..a7185f9853 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -447,4 +447,70 @@ declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i1> %a0, <8 x i64> %a1, %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4) ret i8 %res } - declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) \ No newline at end of file + declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> , <8 x double> , i32, i8, i32) + + ; cvt intrinsics + define <16 x float> @test_cvtdq2ps(<16 x i32> %a) { + ;CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0] + %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float>%res + } + declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) + + define <16 x float> @test_cvtudq2ps(<16 x i32> %a) { + ;CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0] + %res = call <16 x float> 
@llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float>zeroinitializer, i16 -1, i32 1) + ret <16 x float>%res + } + declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32) + + define <8 x double> @test_cvtdq2pd(<8 x i32> %a) { + ;CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1) + ret <8 x double>%res + } + declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) + + define <8 x double> @test_cvtudq2pd(<8 x i32> %a) { + ;CHECK: vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double>zeroinitializer, i8 -1) + ret <8 x double>%res + } + declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) + + ; fp min - max +define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) { + ; CHECK: vmaxps + %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>, + <16 x float>, i16, i32) + +define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) { + ; CHECK: vmaxpd + %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double>zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) + +define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) { + ; CHECK: vminps + %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1, + <16 x float>zeroinitializer, i16 -1, i32 4) + ret <16 x float> %res +} +declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x 
float>, <16 x float>, + <16 x float>, i16, i32) + +define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) { + ; CHECK: vminpd + %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1, + <8 x double>zeroinitializer, i8 -1, i32 4) + ret <8 x double> %res +} +declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>, + <8 x double>, i8, i32) -- cgit v1.2.3