author     Jan Sjödin <jan_sjodin@yahoo.com>    2011-11-30 22:09:42 +0000
committer  Jan Sjödin <jan_sjodin@yahoo.com>    2011-11-30 22:09:42 +0000
commit     dd649e35e522f5e0b5da3f9d172e06a375c12f77 (patch)
tree       2efbf6e9eed61f3b0cc5c8504eee3269b7b43a9b
parent     3dad610aaa50a78225a8a61a8f2aa9d7e30a7136 (diff)
Support for encoding all FMA4 instructions, and tablegen patterns for all
remaining FMA4 instructions and intrinsics, with tests.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@145525 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--  include/llvm/IntrinsicsX86.td               |  128
-rw-r--r--  lib/Target/X86/X86InstrFMA.td               |  349
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64.ll  |  236
-rw-r--r--  test/MC/X86/x86_64-fma4-encoding.s          |  378
4 files changed, 1090 insertions(+), 1 deletion(-)
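
The four FMA4 families wired up below differ only in the signs applied to the product and the addend, plus two lane-alternating forms. As a reference for the semantics, here is a minimal plain-C sketch (helper names are hypothetical, and C's separate multiply and add round twice, where the fused instructions round once):

/* Per-lane reference semantics for the packed FMA4 families added in this
 * patch. Illustration only: a real vfmaddps rounds once, this rounds twice. */
#include <stddef.h>

static void ref_fmadd(float *d, const float *a, const float *b,
                      const float *c, size_t n) {
    for (size_t i = 0; i < n; ++i) d[i] = a[i] * b[i] + c[i];    /* vfmaddps  */
}

static void ref_fnmsub(float *d, const float *a, const float *b,
                       const float *c, size_t n) {
    for (size_t i = 0; i < n; ++i) d[i] = -(a[i] * b[i]) - c[i]; /* vfnmsubps */
}

static void ref_fmaddsub(float *d, const float *a, const float *b,
                         const float *c, size_t n) {
    /* vfmaddsubps subtracts in even lanes and adds in odd lanes;
     * vfmsubaddps is the mirror image. vfmsubps and vfnmaddps follow
     * the same sign scheme as the two functions above. */
    for (size_t i = 0; i < n; ++i)
        d[i] = (i & 1) ? a[i] * b[i] + c[i] : a[i] * b[i] - c[i];
}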
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index ea7597ca2c..2d5d9ff0d2 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -1825,10 +1825,138 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FMA4
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_fma4_vfmadd_ss : GCCBuiltin<"__builtin_ia32_vfmaddss">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">,
Intrinsic<[llvm_v2f64_ty],
[llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
[IntrNoMem]>;
+ def int_x86_fma4_vfmadd_ps : GCCBuiltin<"__builtin_ia32_vfmaddps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmadd_pd : GCCBuiltin<"__builtin_ia32_vfmaddpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_ss : GCCBuiltin<"__builtin_ia32_vfmsubss">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_sd : GCCBuiltin<"__builtin_ia32_vfmsubsd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_ps : GCCBuiltin<"__builtin_ia32_vfmsubps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_pd : GCCBuiltin<"__builtin_ia32_vfmsubpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmsubps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmsubpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_ss : GCCBuiltin<"__builtin_ia32_vfnmaddss">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_sd : GCCBuiltin<"__builtin_ia32_vfnmaddsd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_ps : GCCBuiltin<"__builtin_ia32_vfnmaddps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_pd : GCCBuiltin<"__builtin_ia32_vfnmaddpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmaddps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmadd_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmaddpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_ss : GCCBuiltin<"__builtin_ia32_vfnmsubss">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_sd : GCCBuiltin<"__builtin_ia32_vfnmsubsd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_ps : GCCBuiltin<"__builtin_ia32_vfnmsubps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_pd : GCCBuiltin<"__builtin_ia32_vfnmsubpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfnmsubps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfnmsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfnmsubpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmaddsub_ps_256 :
+ GCCBuiltin<"__builtin_ia32_vfmaddsubps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmaddsub_pd_256 :
+ GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsubadd_ps : GCCBuiltin<"__builtin_ia32_vfmsubaddps">,
+ Intrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsubadd_pd : GCCBuiltin<"__builtin_ia32_vfmsubaddpd">,
+ Intrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsubadd_ps_256 :
+ GCCBuiltin<"__builtin_ia32_vfmsubaddps256">,
+ Intrinsic<[llvm_v8f32_ty],
+ [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty],
+ [IntrNoMem]>;
+ def int_x86_fma4_vfmsubadd_pd_256 :
+ GCCBuiltin<"__builtin_ia32_vfmsubaddpd256">,
+ Intrinsic<[llvm_v4f64_ty],
+ [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty],
+ [IntrNoMem]>;
}
//===----------------------------------------------------------------------===//
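
Each declaration above ties an LLVM intrinsic to the GCC builtin name a front end emits for it. A hedged C sketch of how the new intrinsics surface to user code, assuming a compiler that carries this patch and is invoked with -mfma4 (the vector typedefs mirror the llvm_v4f32_ty and llvm_v2f64_ty operand types):

typedef float  v4sf __attribute__((vector_size(16)));
typedef double v2df __attribute__((vector_size(16)));

/* Lowers to int_x86_fma4_vfmadd_ps via GCCBuiltin<"__builtin_ia32_vfmaddps">. */
v4sf madd_ps(v4sf a, v4sf b, v4sf c)  { return __builtin_ia32_vfmaddps(a, b, c); }

/* Lowers to int_x86_fma4_vfnmsub_pd via GCCBuiltin<"__builtin_ia32_vfnmsubpd">. */
v2df nmsub_pd(v2df a, v2df b, v2df c) { return __builtin_ia32_vfnmsubpd(a, b, c); }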
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index bdf797d5e1..015b01ecff 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -83,12 +83,74 @@ multiclass fma4s<bits<8> opc, string OpcodeStr> {
}
+multiclass fma4p<bits<8> opc, string OpcodeStr> {
+ def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src3, $src1, $dst|$dst, $src1, $src3, $src2}"),
+ []>, XOP_W;
+ def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_W;
+ def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, f128mem:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>;
+ def rrY : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src3, $src1, $dst|$dst, $src1, $src3, $src2}"),
+ []>, XOP_W;
+ def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, XOP_W;
+ def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, f256mem:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>;
+}
+
let isAsmParserOnly = 1 in {
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss">;
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">;
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps">;
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd">;
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss">;
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd">;
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps">;
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd">;
+ defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss">;
+ defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd">;
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps">;
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd">;
+ defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss">;
+ defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd">;
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps">;
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd">;
+ defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps">;
+ defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd">;
+ defm VFMSUBADDPS4 : fma4p<0x5E, "vfmsubaddps">;
+ defm VFMSUBADDPD4 : fma4p<0x5F, "vfmsubaddpd">;
}
// FMA4 Intrinsics patterns
+// VFMADD
+def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3),
(VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2,
@@ -97,3 +159,290 @@ def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2,
def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
VR128:$src3),
(VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmadd_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+// VFMSUB
+def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsub_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+// VFNMADD
+def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMADDSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFNMADDSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFNMADDSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFNMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFNMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFNMADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFNMADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFNMADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFNMADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFNMADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFNMADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFNMADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFNMADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFNMADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmadd_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFNMADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+// VFNMSUB
+def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMSUBSS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFNMSUBSS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ss VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFNMSUBSS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMSUBSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFNMSUBSD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_sd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFNMSUBSD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFNMSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFNMSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFNMSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFNMSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFNMSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFNMSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFNMSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFNMSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFNMSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFNMSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfnmsub_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFNMSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+// VFMADDSUB
+def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMADDSUBPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMADDSUBPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMADDSUBPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMADDSUBPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFMADDSUBPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFMADDSUBPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMADDSUBPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFMADDSUBPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFMADDSUBPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMADDSUBPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFMADDSUBPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmaddsub_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFMADDSUBPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+// VFMSUBADD
+def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBADDPS4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, VR128:$src2,
+ (alignedloadv4f32 addr:$src3)),
+ (VFMSUBADDPS4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_ps VR128:$src1, (alignedloadv4f32 addr:$src2),
+ VR128:$src3),
+ (VFMSUBADDPS4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2, VR128:$src3),
+ (VFMSUBADDPD4rr VR128:$src1, VR128:$src2, VR128:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, VR128:$src2,
+ (alignedloadv2f64 addr:$src3)),
+ (VFMSUBADDPD4rm VR128:$src1, VR128:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_pd VR128:$src1, (alignedloadv2f64 addr:$src2),
+ VR128:$src3),
+ (VFMSUBADDPD4mr VR128:$src1, addr:$src2, VR128:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMSUBADDPS4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1, VR256:$src2,
+ (alignedloadv8f32 addr:$src3)),
+ (VFMSUBADDPS4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_ps_256 VR256:$src1,
+ (alignedloadv8f32 addr:$src2),
+ VR256:$src3),
+ (VFMSUBADDPS4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
+
+def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2, VR256:$src3),
+ (VFMSUBADDPD4rrY VR256:$src1, VR256:$src2, VR256:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1, VR256:$src2,
+ (alignedloadv4f64 addr:$src3)),
+ (VFMSUBADDPD4rmY VR256:$src1, VR256:$src2, addr:$src3)>;
+def : Pat<(int_x86_fma4_vfmsubadd_pd_256 VR256:$src1,
+ (alignedloadv4f64 addr:$src2),
+ VR256:$src3),
+ (VFMSUBADDPD4mrY VR256:$src1, addr:$src2, VR256:$src3)>;
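
The three patterns per intrinsic mirror the rr, rm, and mr instruction forms (rrY/rmY/mrY for 256-bit): all-register, memory in the last source, and memory in the middle source. Because the memory patterns match alignedloadv4f32, alignedloadv2f64, and their 256-bit equivalents, instruction selection can fold an aligned load of either multiplicand or the addend straight into the FMA4 instruction. A hedged C illustration (function name hypothetical):

typedef float v4sf __attribute__((vector_size(16)));

/* With -mfma4, the aligned load of *c can match the
 * (alignedloadv4f32 addr:$src3) pattern above and select VFMADDPS4rm,
 * folding the load instead of emitting a separate vmovaps. */
v4sf madd_from_mem(v4sf a, v4sf b, const v4sf *c) {
    return __builtin_ia32_vfmaddps(a, b, *c);
}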
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 39c2311eb5..bd94c134ce 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,12 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+
+; VFMADD
+define < 4 x float > @test_x86_fma4_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmaddss
+ %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
; CHECK: vfmaddsd
@@ -7,3 +15,229 @@ define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double
}
declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+define < 4 x float > @test_x86_fma4_vfmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmaddps
+ %res = call < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfmaddpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfmaddps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfmaddpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMSUB
+define < 4 x float > @test_x86_fma4_vfmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmsubss
+ %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfmsubsd
+ %res = call < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma4_vfmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmsubps
+ %res = call < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfmsubpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfmsubps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfmsubpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMADD
+define < 4 x float > @test_x86_fma4_vfnmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfnmaddss
+ %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmadd.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfnmaddsd
+ %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma4_vfnmadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfnmaddps
+ %res = call < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfnmaddpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfnmadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfnmaddps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfnmadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfnmadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfnmaddpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfnmadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFNMSUB
+define < 4 x float > @test_x86_fma4_vfnmsub_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfnmsubss
+ %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmsub.ss(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmsub_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfnmsubsd
+ %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmsub.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 4 x float > @test_x86_fma4_vfnmsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfnmsubps
+ %res = call < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfnmsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfnmsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfnmsubpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfnmsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfnmsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfnmsubps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfnmsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfnmsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfnmsubpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfnmsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMADDSUB
+define < 4 x float > @test_x86_fma4_vfmaddsub_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmaddsubps
+ %res = call < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmaddsub.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmaddsub_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfmaddsubpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmaddsub.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmaddsub_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfmaddsubps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmaddsub.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmaddsub_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfmaddsubpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmaddsub.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
+
+; VFMSUBADD
+define < 4 x float > @test_x86_fma4_vfmsubadd_ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
+ ; CHECK: vfmsubaddps
+ %res = call < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) ; <i64> [#uses=1]
+ ret < 4 x float > %res
+}
+declare < 4 x float > @llvm.x86.fma4.vfmsubadd.ps(< 4 x float >, < 4 x float >, < 4 x float >) nounwind readnone
+
+define < 2 x double > @test_x86_fma4_vfmsubadd_pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) {
+ ; CHECK: vfmsubaddpd
+ %res = call < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1]
+ ret < 2 x double > %res
+}
+declare < 2 x double > @llvm.x86.fma4.vfmsubadd.pd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone
+
+define < 8 x float > @test_x86_fma4_vfmsubadd_ps_256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) {
+ ; CHECK: vfmsubaddps
+ ; CHECK: ymm
+ %res = call < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float > %a0, < 8 x float > %a1, < 8 x float > %a2) ; <i64> [#uses=1]
+ ret < 8 x float > %res
+}
+declare < 8 x float > @llvm.x86.fma4.vfmsubadd.ps.256(< 8 x float >, < 8 x float >, < 8 x float >) nounwind readnone
+
+define < 4 x double > @test_x86_fma4_vfmsubadd_pd_256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) {
+ ; CHECK: vfmsubaddpd
+ ; CHECK: ymm
+ %res = call < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double > %a0, < 4 x double > %a1, < 4 x double > %a2) ; <i64> [#uses=1]
+ ret < 4 x double > %res
+}
+declare < 4 x double > @llvm.x86.fma4.vfmsubadd.pd.256(< 4 x double >, < 4 x double >, < 4 x double >) nounwind readnone
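
The _256 tests also CHECK for a ymm operand to confirm that the 256-bit rrY forms are selected. A corresponding hedged C sketch using the 256-bit builtin declared in this patch (same caveats as above):

typedef float v8sf __attribute__((vector_size(32)));

/* Lowers to int_x86_fma4_vfmadd_ps_256 and should select VFMADDPS4rrY,
 * i.e. vfmaddps on ymm registers, matching the CHECK lines above. */
v8sf madd_ps256(v8sf a, v8sf b, v8sf c) {
    return __builtin_ia32_vfmaddps256(a, b, c);
}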
diff --git a/test/MC/X86/x86_64-fma4-encoding.s b/test/MC/X86/x86_64-fma4-encoding.s
index e0d2602901..805fc23cf4 100644
--- a/test/MC/X86/x86_64-fma4-encoding.s
+++ b/test/MC/X86/x86_64-fma4-encoding.s
@@ -1,5 +1,18 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+// vfmadd
+// CHECK: vfmaddss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6a,0x01,0x10]
+ vfmaddss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6a,0x01,0x10]
+ vfmaddss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6a,0xc2,0x10]
+ vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
+
// CHECK: vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0
// CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0x01,0x10]
vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0
@@ -11,3 +24,368 @@
// CHECK: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
// CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10]
vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x68,0x01,0x10]
+ vfmaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x68,0x01,0x10]
+ vfmaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x68,0xc2,0x10]
+ vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x69,0x01,0x10]
+ vfmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x69,0x01,0x10]
+ vfmaddpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x69,0xc2,0x10]
+ vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x68,0x01,0x10]
+ vfmaddps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x68,0x01,0x10]
+ vfmaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x68,0xc2,0x10]
+ vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0x01,0x10]
+ vfmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x69,0x01,0x10]
+ vfmaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x69,0xc2,0x10]
+ vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmsub
+// CHECK: vfmsubss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6e,0x01,0x10]
+ vfmsubss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6e,0x01,0x10]
+ vfmsubss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6e,0xc2,0x10]
+ vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6f,0x01,0x10]
+ vfmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6f,0x01,0x10]
+ vfmsubsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6f,0xc2,0x10]
+ vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6c,0x01,0x10]
+ vfmsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6c,0x01,0x10]
+ vfmsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6c,0xc2,0x10]
+ vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6d,0x01,0x10]
+ vfmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x6d,0x01,0x10]
+ vfmsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x6d,0xc2,0x10]
+ vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6c,0x01,0x10]
+ vfmsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x6c,0x01,0x10]
+ vfmsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6c,0xc2,0x10]
+ vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6d,0x01,0x10]
+ vfmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x6d,0x01,0x10]
+ vfmsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x6d,0xc2,0x10]
+ vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfnmadd
+// CHECK: vfnmaddss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7a,0x01,0x10]
+ vfnmaddss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7a,0x01,0x10]
+ vfnmaddss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7a,0xc2,0x10]
+ vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7b,0x01,0x10]
+ vfnmaddsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7b,0x01,0x10]
+ vfnmaddsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7b,0xc2,0x10]
+ vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x78,0x01,0x10]
+ vfnmaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x78,0x01,0x10]
+ vfnmaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x78,0xc2,0x10]
+ vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x79,0x01,0x10]
+ vfnmaddpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x79,0x01,0x10]
+ vfnmaddpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x79,0xc2,0x10]
+ vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x78,0x01,0x10]
+ vfnmaddps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x78,0x01,0x10]
+ vfnmaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x78,0xc2,0x10]
+ vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x79,0x01,0x10]
+ vfnmaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x79,0x01,0x10]
+ vfnmaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x79,0xc2,0x10]
+ vfnmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfnmsub
+// CHECK: vfnmsubss (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7e,0x01,0x10]
+ vfnmsubss (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubss %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7e,0x01,0x10]
+ vfnmsubss %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7e,0xc2,0x10]
+ vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7f,0x01,0x10]
+ vfnmsubsd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubsd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7f,0x01,0x10]
+ vfnmsubsd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7f,0xc2,0x10]
+ vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7c,0x01,0x10]
+ vfnmsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7c,0x01,0x10]
+ vfnmsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7c,0xc2,0x10]
+ vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7d,0x01,0x10]
+ vfnmsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x7d,0x01,0x10]
+ vfnmsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x7d,0xc2,0x10]
+ vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfnmsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7c,0x01,0x10]
+ vfnmsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x7c,0x01,0x10]
+ vfnmsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7c,0xc2,0x10]
+ vfnmsubps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7d,0x01,0x10]
+ vfnmsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfnmsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x7d,0x01,0x10]
+ vfnmsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x7d,0xc2,0x10]
+ vfnmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmaddsub
+// CHECK: vfmaddsubps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5c,0x01,0x10]
+ vfmaddsubps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5c,0x01,0x10]
+ vfmaddsubps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5c,0xc2,0x10]
+ vfmaddsubps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5d,0x01,0x10]
+ vfmaddsubpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5d,0x01,0x10]
+ vfmaddsubpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5d,0xc2,0x10]
+ vfmaddsubpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmaddsubps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5c,0x01,0x10]
+ vfmaddsubps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5c,0x01,0x10]
+ vfmaddsubps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5c,0xc2,0x10]
+ vfmaddsubps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5d,0x01,0x10]
+ vfmaddsubpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmaddsubpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5d,0x01,0x10]
+ vfmaddsubpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5d,0xc2,0x10]
+ vfmaddsubpd %ymm2, %ymm1, %ymm0, %ymm0
+
+// vfmsubadd
+// CHECK: vfmsubaddps (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5e,0x01,0x10]
+ vfmsubaddps (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddps %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5e,0x01,0x10]
+ vfmsubaddps %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5e,0xc2,0x10]
+ vfmsubaddps %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddpd (%rcx), %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5f,0x01,0x10]
+ vfmsubaddpd (%rcx), %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddpd %xmm1, (%rcx), %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0x79,0x5f,0x01,0x10]
+ vfmsubaddpd %xmm1, (%rcx),%xmm0, %xmm0
+
+// CHECK: vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+// CHECK: encoding: [0xc4,0xe3,0xf9,0x5f,0xc2,0x10]
+ vfmsubaddpd %xmm2, %xmm1, %xmm0, %xmm0
+
+// CHECK: vfmsubaddps (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5e,0x01,0x10]
+ vfmsubaddps (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubaddps %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5e,0x01,0x10]
+ vfmsubaddps %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5e,0xc2,0x10]
+ vfmsubaddps %ymm2, %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubaddpd (%rcx), %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5f,0x01,0x10]
+ vfmsubaddpd (%rcx), %ymm1, %ymm0, %ymm0
+
+// CHECK: vfmsubaddpd %ymm1, (%rcx), %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0x7d,0x5f,0x01,0x10]
+ vfmsubaddpd %ymm1, (%rcx),%ymm0, %ymm0
+
+// CHECK: vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
+// CHECK: encoding: [0xc4,0xe3,0xfd,0x5f,0xc2,0x10]
+ vfmsubaddpd %ymm2, %ymm1, %ymm0, %ymm0
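
Every encoding in this file starts with the three-byte VEX prefix 0xc4, and the paired tests differ only in the third prefix byte: 0xf9 versus 0x79 flips VEX.W, which selects whether the ModRM memory operand is $src3 (the rm form) or $src2 (the mr form), and 0xf9 versus 0xfd flips VEX.L, which selects xmm versus ymm width. A small self-contained C helper, illustrative only and not part of the patch, that unpacks those fields:

#include <stdint.h>
#include <stdio.h>

/* Decode the three-byte VEX prefix (0xc4 xx xx) used by all FMA4 encodings. */
static void decode_vex3(const uint8_t e[3]) {
    unsigned map  = e[1] & 0x1f;                  /* 0x03 = 0F3A opcode map     */
    unsigned w    = (e[2] >> 7) & 1;              /* 1: rm is $src3; 0: $src2   */
    unsigned src1 = (~(unsigned)e[2] >> 3) & 0xf; /* VEX.vvvv (inverted): $src1 */
    unsigned l    = (e[2] >> 2) & 1;              /* 0: 128-bit; 1: 256-bit     */
    unsigned pp   = e[2] & 0x3;                   /* 01: implied 66 prefix      */
    printf("map=%#x W=%u src1=%u L=%u pp=%u\n", map, w, src1, l, pp);
}

int main(void) {
    const uint8_t rm[3] = {0xc4, 0xe3, 0xf9}; /* vfmaddss (%rcx), %xmm1, ... */
    const uint8_t mr[3] = {0xc4, 0xe3, 0x79}; /* vfmaddss %xmm1, (%rcx), ... */
    decode_vex3(rm);
    decode_vex3(mr);
    return 0;
}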