summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdam Nemet <anemet@apple.com>2014-05-29 23:35:36 +0000
committerAdam Nemet <anemet@apple.com>2014-05-29 23:35:36 +0000
commit7cd893a9859dcba1045aef2ca66de95f0561344b (patch)
tree7524b413f41f1b5da2eb7273e06bade4f022dcc1
parent23356d4d64d4511d340bc6dad1464c8a3c65473b (diff)
downloadllvm-7cd893a9859dcba1045aef2ca66de95f0561344b.tar.gz
llvm-7cd893a9859dcba1045aef2ca66de95f0561344b.tar.bz2
llvm-7cd893a9859dcba1045aef2ca66de95f0561344b.tar.xz
[X86] Remove AVX1 vbroadcast intrinsics
The corresponding CFE patch replaces these intrinsics with vector initializers in avxintrin.h. This patch removes the LLVM intrinsics from the backend. We now stop lowering at X86ISD::VBROADCAST custom node rather than lowering that further to the intrinsics. The patch only changes VBROADCASTS* and leaves VBROADCAST[FI]128 to continue to use intrinsics. As explained in the CFE patch, the reason is that we currently don't generate as good code for them without the intrinsics. CodeGen/X86/avx-vbroadcast.ll already provides coverage for this change. It checks that for a series of insertelements we generate the appropriate vbroadcast instruction. Also verified that there was no assembly change in the test-suite before and after this patch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@209864 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/IR/IntrinsicsX86.td9
-rw-r--r--lib/Target/X86/X86InstrSSE.td32
-rw-r--r--test/CodeGen/X86/avx-intrinsics-x86.ll24
3 files changed, 17 insertions, 48 deletions
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 36d93fee8a..88ee1cec2d 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -1304,15 +1304,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Vector load with broadcast
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_avx_vbroadcast_ss :
- GCCBuiltin<"__builtin_ia32_vbroadcastss">,
- Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
- def int_x86_avx_vbroadcast_sd_256 :
- GCCBuiltin<"__builtin_ia32_vbroadcastsd256">,
- Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
- def int_x86_avx_vbroadcast_ss_256 :
- GCCBuiltin<"__builtin_ia32_vbroadcastss256">,
- Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
def int_x86_avx_vbroadcastf128_pd_256 :
GCCBuiltin<"__builtin_ia32_vbroadcastf128_pd256">,
Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty], [IntrReadArgMem]>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1eb04851b7..043b2f32c6 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7969,6 +7969,16 @@ class avx_broadcast<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (Int addr:$src))]>, Sched<[Sched]>, VEX;
+class avx_broadcast_no_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop, ValueType VT,
+ PatFrag ld_frag, SchedWrite Sched> :
+ AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
+ Sched<[Sched]>, VEX {
+ let mayLoad = 1;
+}
+
// AVX2 adds register forms
class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
Intrinsic Int, SchedWrite Sched> :
@@ -7977,16 +7987,15 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (Int VR128:$src))]>, Sched<[Sched]>, VEX;
let ExeDomain = SSEPackedSingle in {
- def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
- int_x86_avx_vbroadcast_ss, WriteLoad>;
- def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256,
- WriteFShuffleLd>, VEX_L;
+ def VBROADCASTSSrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR128,
+ f32mem, v4f32, loadf32, WriteLoad>;
+ def VBROADCASTSSYrm : avx_broadcast_no_int<0x18, "vbroadcastss", VR256,
+ f32mem, v8f32, loadf32,
+ WriteFShuffleLd>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
-def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256,
- WriteFShuffleLd>, VEX_L;
+def VBROADCASTSDYrm : avx_broadcast_no_int<0x19, "vbroadcastsd", VR256, f64mem,
+ v4f64, loadf64, WriteFShuffleLd>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256,
WriteFShuffleLd>, VEX_L;
@@ -8543,13 +8552,6 @@ def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
}
let Predicates = [HasAVX] in {
-def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSYrm addr:$src)>;
-def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))),
- (VBROADCASTSDYrm addr:$src)>;
-def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))),
- (VBROADCASTSSrm addr:$src)>;
-
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
let AddedComplexity = 20 in {
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index 0be83f648d..ce31161dbb 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -2219,14 +2219,6 @@ define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind
-define <4 x double> @test_x86_avx_vbroadcast_sd_256(i8* %a0) {
- ; CHECK: vbroadcastsd
- %res = call <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8* %a0) ; <<4 x double>> [#uses=1]
- ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.vbroadcast.sd.256(i8*) nounwind readonly
-
-
define <4 x double> @test_x86_avx_vbroadcastf128_pd_256(i8* %a0) {
; CHECK: vbroadcastf128
%res = call <4 x double> @llvm.x86.avx.vbroadcastf128.pd.256(i8* %a0) ; <<4 x double>> [#uses=1]
@@ -2243,22 +2235,6 @@ define <8 x float> @test_x86_avx_vbroadcastf128_ps_256(i8* %a0) {
declare <8 x float> @llvm.x86.avx.vbroadcastf128.ps.256(i8*) nounwind readonly
-define <4 x float> @test_x86_avx_vbroadcast_ss(i8* %a0) {
- ; CHECK: vbroadcastss
- %res = call <4 x float> @llvm.x86.avx.vbroadcast.ss(i8* %a0) ; <<4 x float>> [#uses=1]
- ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.avx.vbroadcast.ss(i8*) nounwind readonly
-
-
-define <8 x float> @test_x86_avx_vbroadcast_ss_256(i8* %a0) {
- ; CHECK: vbroadcastss
- %res = call <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8* %a0) ; <<8 x float>> [#uses=1]
- ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.vbroadcast.ss.256(i8*) nounwind readonly
-
-
define <2 x double> @test_x86_avx_vextractf128_pd_256(<4 x double> %a0) {
; CHECK: vextractf128
%res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 7) ; <<2 x double>> [#uses=1]