diff options
-rw-r--r-- | include/llvm/IntrinsicsX86.td | 46 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrFormats.td | 14 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrSSE.td | 95 | ||||
-rw-r--r-- | test/CodeGen/X86/avx2-intrinsics-x86.ll | 80 |
4 files changed, 214 insertions, 21 deletions
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index 3b11ba5608..b443a096cf 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -1668,22 +1668,52 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pblendw : GCCBuiltin<"__builtin_ia32_pblendw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx2_pblendd_128 : GCCBuiltin<"__builtin_ia32_pblendd128">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, + llvm_i32_ty], [IntrNoMem]>; + def int_x86_avx2_pblendd_256 : GCCBuiltin<"__builtin_ia32_pblendd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, + llvm_i32_ty], [IntrNoMem]>; } // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_vbroadcast_ss_ps : - GCCBuiltin<"__builtin_ia32_vbroadcastss_ps">, - Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrReadMem]>; + GCCBuiltin<"__builtin_ia32_vbroadcastss_ps">, + Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx2_vbroadcast_sd_pd_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd256">, - Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrReadMem]>; + GCCBuiltin<"__builtin_ia32_vbroadcastsd_pd256">, + Intrinsic<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_avx2_vbroadcast_ss_ps_256 : - GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">, - Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrReadMem]>; + GCCBuiltin<"__builtin_ia32_vbroadcastss_ps256">, + Intrinsic<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx2_vbroadcasti128 : - GCCBuiltin<"__builtin_ia32_vbroadcastsi256">, - Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; + GCCBuiltin<"__builtin_ia32_vbroadcastsi256">, + Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>; + def int_x86_avx2_pbroadcastb_128 : + GCCBuiltin<"__builtin_ia32_pbroadcastb128">, + 
Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastb_256 : + GCCBuiltin<"__builtin_ia32_pbroadcastb256">, + Intrinsic<[llvm_v32i8_ty], [llvm_v16i8_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastw_128 : + GCCBuiltin<"__builtin_ia32_pbroadcastw128">, + Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastw_256 : + GCCBuiltin<"__builtin_ia32_pbroadcastw256">, + Intrinsic<[llvm_v16i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastd_128 : + GCCBuiltin<"__builtin_ia32_pbroadcastd128">, + Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastd_256 : + GCCBuiltin<"__builtin_ia32_pbroadcastd256">, + Intrinsic<[llvm_v8i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastq_128 : + GCCBuiltin<"__builtin_ia32_pbroadcastq128">, + Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; + def int_x86_avx2_pbroadcastq_256 : + GCCBuiltin<"__builtin_ia32_pbroadcastq256">, + Intrinsic<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; } // Misc. diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 5236dafd99..b7c172e03d 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -451,6 +451,20 @@ class AVXAIi8<bits<8> o, Format F, dag outs, dag ins, string asm, : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize, Requires<[HasAVX]>; +// AVX2 Instruction Templates: +// Instructions introduced in AVX2 (no SSE equivalent forms) +// +// AVX28I - AVX2 instructions with T8 and OpSize prefix. +// AVX2Ii8 - AVX2 instructions with TA, OpSize prefix and ImmT = Imm8. 
+class AVX28I<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, + Requires<[HasAVX2]>; +class AVX2Ii8<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag> pattern> + : Ii8<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, OpSize, + Requires<[HasAVX2]>; + // AES Instruction Templates: // // AES8I diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index d2291a246e..acd9a80438 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7083,11 +7083,12 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int addr:$src))]>, VEX; -class avx_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, - Intrinsic Int> : - AVX8I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, (Int VR128:$src))]>, VEX; +// AVX2 adds register forms +class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, + Intrinsic Int> : + AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set RC:$dst, (Int VR128:$src))]>, VEX; def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, int_x86_avx_vbroadcast_ss>; @@ -7098,16 +7099,16 @@ def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; -let Predicates = [HasAVX2] in { -def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, - int_x86_avx2_vbroadcasti128>; -def VBROADCASTSSrr : avx_broadcast_reg<0x18, "vbroadcastss", VR128, +def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, int_x86_avx2_vbroadcast_ss_ps>; -def VBROADCASTSSYrr : avx_broadcast_reg<0x18, "vbroadcastss", VR256, +def VBROADCASTSSYrr 
: avx2_broadcast_reg<0x18, "vbroadcastss", VR256, int_x86_avx2_vbroadcast_ss_ps_256>; -def VBROADCASTSDrr : avx_broadcast_reg<0x19, "vbroadcastsd", VR256, +def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, int_x86_avx2_vbroadcast_sd_pd_256>; -} + +let Predicates = [HasAVX2] in +def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, + int_x86_avx2_vbroadcasti128>; def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; @@ -7364,7 +7365,7 @@ let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, //===----------------------------------------------------------------------===// // Half precision conversion instructions -// +//===----------------------------------------------------------------------===// multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { let Predicates = [HasAVX, HasF16C] in { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), @@ -7396,3 +7397,71 @@ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>; defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>; + +//===----------------------------------------------------------------------===// +// AVX2 Instructions +//===----------------------------------------------------------------------===// + +/// AVX2I_binop_rmi_int - AVX2 binary operator with 8-bit immediate +multiclass AVX2I_binop_rmi_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop> { + let isCommutable = 1 in + def rri : AVX2Ii8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))]>, + VEX_4V; + def rmi : AVX2Ii8<opc, MRMSrcMem, (outs RC:$dst), + 
(ins RC:$src1, x86memop:$src2, u32u8imm:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + [(set RC:$dst, + (IntId RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3))]>, + VEX_4V; +} + +let isCommutable = 0 in { +defm VPBLENDD : AVX2I_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128, + VR128, memopv16i8, i128mem>; +defm VPBLENDDY : AVX2I_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256, + VR256, memopv32i8, i256mem>; +} + +//===----------------------------------------------------------------------===// +// VPBROADCAST - Load from memory and broadcast to all elements of the +// destination operand +// +multiclass avx2_broadcast<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, PatFrag ld_frag, + Intrinsic Int128, Intrinsic Int256> { + def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, (Int128 VR128:$src))]>, VEX; + def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR128:$dst, + (Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; + def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, (Int256 VR128:$src))]>, VEX; + def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src), + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), + [(set VR256:$dst, + (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX; +} + +defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8, + int_x86_avx2_pbroadcastb_128, + int_x86_avx2_pbroadcastb_256>; +defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16, + int_x86_avx2_pbroadcastw_128, + int_x86_avx2_pbroadcastw_256>; +defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, + int_x86_avx2_pbroadcastd_128, + 
int_x86_avx2_pbroadcastd_256>; +defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, + int_x86_avx2_pbroadcastq_128, + int_x86_avx2_pbroadcastq_256>; diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll index 81ffdea827..24471e6f1a 100644 --- a/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -766,3 +766,83 @@ define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) { ret <8 x float> %res } declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly + + +define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) { + ; CHECK: vpblendd + %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone + + +define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) { + ; CHECK: vpblendd + %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone + + +define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) { + ; CHECK: vpbroadcastb + %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly + + +define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) { + ; CHECK: vpbroadcastb + %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly + + +define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) { + ; CHECK: vpbroadcastw + %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> 
%a0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly + + +define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) { + ; CHECK: vpbroadcastw + %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly + + +define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) { + ; CHECK: vpbroadcastd + %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %res +} +declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly + + +define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) { + ; CHECK: vpbroadcastd + %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1] + ret <8 x i32> %res +} +declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly + + +define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) { + ; CHECK: vpbroadcastq + %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %res +} +declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly + + +define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) { + ; CHECK: vpbroadcastq + %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1] + ret <4 x i64> %res +} +declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly |