summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/llvm/IntrinsicsX86.td80
-rw-r--r--lib/Target/X86/X86InstrSSE.td111
-rw-r--r--test/CodeGen/X86/avx2-intrinsics-x86.ll137
3 files changed, 316 insertions, 12 deletions
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index acccb14959..04a119f055 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -1398,6 +1398,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq256">,
Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
llvm_v8i32_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmul_dq : GCCBuiltin<"__builtin_ia32_pmuldq256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
@@ -1407,21 +1410,49 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+}
+
+// Vector min, max
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmaxu_w : GCCBuiltin<"__builtin_ia32_pmaxuw256">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+ llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmaxu_d : GCCBuiltin<"__builtin_ia32_pmaxud256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmaxs_b : GCCBuiltin<"__builtin_ia32_pmaxsb256">,
+ Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmaxs_d : GCCBuiltin<"__builtin_ia32_pmaxsd256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_pminu_b : GCCBuiltin<"__builtin_ia32_pminub256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw256">,
+ def int_x86_avx2_pminu_w : GCCBuiltin<"__builtin_ia32_pminuw256">,
Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem, Commutative]>;
- def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
- Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
+ def int_x86_avx2_pminu_d : GCCBuiltin<"__builtin_ia32_pminud256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmins_b : GCCBuiltin<"__builtin_ia32_pminsb256">,
+ Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw256">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+ llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_pmins_d : GCCBuiltin<"__builtin_ia32_pminsd256">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem, Commutative]>;
}
// Integer shift ops.
@@ -1501,15 +1532,21 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pcmpeq_d : GCCBuiltin<"__builtin_ia32_pcmpeqd256">,
Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
[IntrNoMem, Commutative]>;
+ def int_x86_avx2_pcmpeq_q : GCCBuiltin<"__builtin_ia32_pcmpeqq256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
+ [IntrNoMem, Commutative]>;
def int_x86_avx2_pcmpgt_b : GCCBuiltin<"__builtin_ia32_pcmpgtb256">,
- Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
- llvm_v32i8_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty],
+ [IntrNoMem]>;
def int_x86_avx2_pcmpgt_w : GCCBuiltin<"__builtin_ia32_pcmpgtw256">,
- Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
- llvm_v16i16_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty],
+ [IntrNoMem]>;
def int_x86_avx2_pcmpgt_d : GCCBuiltin<"__builtin_ia32_pcmpgtd256">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty,
- llvm_v8i32_ty], [IntrNoMem]>;
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty],
+ [IntrNoMem]>;
+ def int_x86_avx2_pcmpgt_q : GCCBuiltin<"__builtin_ia32_pcmpgtq256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty],
+ [IntrNoMem]>;
}
// Pack ops.
@@ -1523,6 +1560,9 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,
llvm_v16i16_ty], [IntrNoMem]>;
+ def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty,
+ llvm_v8i32_ty], [IntrNoMem]>;
}
// Absolute value ops
@@ -1620,6 +1660,23 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
[IntrNoMem]>;
}
+// Vector blend
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">,
+ Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+ llvm_v32i8_ty], [IntrNoMem]>;
+ def int_x86_avx2_pblendw : GCCBuiltin<"__builtin_ia32_pblendw256">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+}
+
+// Vector load with broadcast
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_avx2_vbroadcasti128 :
+ GCCBuiltin<"__builtin_ia32_vbroadcastsi256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
+}
+
// Misc.
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
@@ -1627,6 +1684,11 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
llvm_v32i8_ty], [IntrNoMem]>;
+ def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
+ Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty,
+ llvm_i32_ty], [IntrNoMem, Commutative]>;
+ def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
+ Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f30a0c4699..77a9031090 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -6248,6 +6248,22 @@ multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
(bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ let isCommutable = 1 in
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>, OpSize;
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1,
+ (bitconvert (memopv32i8 addr:$src2))))]>, OpSize;
+}
+
let Predicates = [HasAVX] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
@@ -6279,6 +6295,32 @@ let Predicates = [HasAVX] in {
(VPCMPEQQrm VR128:$src1, addr:$src2)>;
}
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in
+ defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
+ int_x86_avx2_packusdw>, VEX_4V;
+ defm VPCMPEQQ : SS41I_binop_rm_int_y<0x29, "vpcmpeqq",
+ int_x86_avx2_pcmpeq_q>, VEX_4V;
+ defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb",
+ int_x86_avx2_pmins_b>, VEX_4V;
+ defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd",
+ int_x86_avx2_pmins_d>, VEX_4V;
+ defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud",
+ int_x86_avx2_pminu_d>, VEX_4V;
+ defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw",
+ int_x86_avx2_pminu_w>, VEX_4V;
+ defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
+ int_x86_avx2_pmaxs_b>, VEX_4V;
+ defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
+ int_x86_avx2_pmaxs_d>, VEX_4V;
+ defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
+ int_x86_avx2_pmaxu_d>, VEX_4V;
+ defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
+ int_x86_avx2_pmaxu_w>, VEX_4V;
+ defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
+ int_x86_avx2_pmul_dq>, VEX_4V;
+}
+
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in
defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
@@ -6301,7 +6343,7 @@ def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType OpVT, bit Is2Addr = 1> {
+ ValueType OpVT, bit Is2Addr = 1> {
let isCommutable = 1 in
def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
@@ -6320,8 +6362,27 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpSize;
}
+/// SS48I_binop_rm - Simple SSE41 binary operator.
+multiclass SS48I_binop_rm_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT> {
+ let isCommutable = 1 in
+ def Yrr : SS48I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (OpVT (OpNode VR256:$src1, VR256:$src2)))]>,
+ OpSize;
+ def Yrm : SS48I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (bc_v8i32 (memopv4i64 addr:$src2))))]>,
+ OpSize;
+}
+
let Predicates = [HasAVX] in
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, 0>, VEX_4V;
+let Predicates = [HasAVX2] in
+ defm VPMULLD : SS48I_binop_rm_y<0x40, "vpmulld", mul, v8i32>, VEX_4V;
let Constraints = "$src1 = $dst" in
defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32>;
@@ -6375,6 +6436,15 @@ let Predicates = [HasAVX] in {
VR256, memopv32i8, i256mem, 0>, VEX_4V;
}
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in {
+ defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
+ VR256, memopv32i8, i256mem, 0>, VEX_4V;
+ defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
+ VR256, memopv32i8, i256mem, 0>, VEX_4V;
+ }
+}
+
let Constraints = "$src1 = $dst" in {
let isCommutable = 0 in {
defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps,
@@ -6393,7 +6463,6 @@ let Constraints = "$src1 = $dst" in {
}
/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
-let Predicates = [HasAVX] in {
multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop,
PatFrag mem_frag, Intrinsic IntId> {
@@ -6413,8 +6482,8 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
RC:$src3))],
SSEPackedInt>, OpSize, TA, VEX_4V, VEX_I8IMM;
}
-}
+let Predicates = [HasAVX] in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem,
memopv16i8, int_x86_sse41_blendvpd>;
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem,
@@ -6425,6 +6494,12 @@ defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem,
memopv32i8, int_x86_avx_blendv_pd_256>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem,
memopv32i8, int_x86_avx_blendv_ps_256>;
+}
+
+let Predicates = [HasAVX2] in {
+defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
+ memopv32i8, int_x86_avx2_pblendvb>;
+}
let Predicates = [HasAVX] in {
def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
@@ -6503,6 +6578,11 @@ def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
OpSize, VEX;
+let Predicates = [HasAVX2] in
+def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+ "vmovntdqa\t{$src, $dst|$dst, $src}",
+ [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
+ OpSize, VEX;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -6532,6 +6612,22 @@ multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
(bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
}
+/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
+multiclass SS42I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId256> {
+ def Yrr : SS428I<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
+ OpSize;
+ def Yrm : SS428I<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, i256mem:$src2),
+ !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (IntId256 VR256:$src1,
+ (bitconvert (memopv32i8 addr:$src2))))]>, OpSize;
+}
+
let Predicates = [HasAVX] in {
defm VPCMPGTQ : SS42I_binop_rm_int<0x37, "vpcmpgtq", int_x86_sse42_pcmpgtq,
0>, VEX_4V;
@@ -6542,6 +6638,11 @@ let Predicates = [HasAVX] in {
(VPCMPGTQrm VR128:$src1, addr:$src2)>;
}
+let Predicates = [HasAVX2] in {
+ defm VPCMPGTQ : SS42I_binop_rm_int_y<0x37, "vpcmpgtq", int_x86_avx2_pcmpgt_q>,
+ VEX_4V;
+}
+
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
@@ -6991,6 +7092,10 @@ def VBROADCASTSD : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
int_x86_avx_vbroadcastf128_pd_256>;
+let Predicates = [HasAVX2] in
+def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
+ int_x86_avx2_vbroadcasti128>;
+
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
(VBROADCASTF128 addr:$src)>;
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 2dc0f5ce2e..8aef255cae 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -512,6 +512,119 @@ define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
+define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
+ ; CHECK: movl
+ ; CHECK: vmovntdqa
+ %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
+
+
+define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
+ ; CHECK: vmpsadbw
+ %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpackusdw
+ %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
+ ; CHECK: vpblendvb
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ ; CHECK: vpblendw
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pcmpeqq(<4 x i64> %a0, <4 x i64> %a1) {
+ ; CHECK: vpcmpeqq
+ %res = call <4 x i64> @llvm.x86.avx2.pcmpeq.q(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pcmpeq.q(<4 x i64>, <4 x i64>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
+ ; CHECK: vpmaxsb
+ %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpmaxsd
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpmaxud
+ %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
+ ; CHECK: vpmaxuw
+ %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
+define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
+ ; CHECK: vpminsb
+ %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+ ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpminsd
+ %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpminud
+ %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
+ ; CHECK: vpminuw
+ %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+ ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+
define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
; CHECK: vpmovsxbd
%res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
@@ -606,3 +719,27 @@ define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) {
+ ; CHECK: vpmuldq
+ %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<2 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_pcmpgtq(<4 x i64> %a0, <4 x i64> %a1) {
+ ; CHECK: vpcmpgtq
+ %res = call <4 x i64> @llvm.x86.avx2.pcmpgt.q(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pcmpgt.q(<4 x i64>, <4 x i64>) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_vbroadcasti128(i8* %a0) {
+ ; CHECK: vbroadcasti128
+ %res = call <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8* %a0) ; <<4 x i64>> [#uses=1]
+ ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8*) nounwind readonly