summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2012-08-01 12:06:00 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2012-08-01 12:06:00 +0000
commit1503aba4a036f5394c7983417bc1e64613b2fc77 (patch)
tree859379e44f296187b184f9bc832a3ce072a95a37
parent6ce2471806316ae9e1c7715f703c380d4f4311b6 (diff)
downloadllvm-1503aba4a036f5394c7983417bc1e64613b2fc77.tar.gz
llvm-1503aba4a036f5394c7983417bc1e64613b2fc77.tar.bz2
llvm-1503aba4a036f5394c7983417bc1e64613b2fc77.tar.xz
Added FMA functionality to X86 target.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@161110 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/CodeGen/SelectionDAG/DAGCombiner.cpp28
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp51
-rw-r--r--lib/Target/X86/X86ISelLowering.h14
-rw-r--r--lib/Target/X86/X86InstrFMA.td246
-rw-r--r--lib/Target/X86/X86InstrFragmentsSIMD.td17
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp66
-rw-r--r--test/CodeGen/X86/fma.ll12
-rwxr-xr-xtest/CodeGen/X86/fma3-intrinsics.ll36
-rw-r--r--test/CodeGen/X86/fma_patterns.ll139
9 files changed, 462 insertions, 147 deletions
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 747bc446a7..0f019ef716 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5679,7 +5679,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
- TLI.isOperationLegal(ISD::FMA, VT)) {
+ TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
// fold (fadd (fmul x, y), z) -> (fma x, y, z)
if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
@@ -5704,6 +5704,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
+ DebugLoc dl = N->getDebugLoc();
// fold vector ops
if (VT.isVector()) {
@@ -5724,11 +5725,11 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
return GetNegatedExpression(N1, DAG, LegalOperations);
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
- return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1);
+ return DAG.getNode(ISD::FNEG, dl, VT, N1);
}
// fold (fsub A, (fneg B)) -> (fadd A, B)
if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options))
- return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0,
+ return DAG.getNode(ISD::FADD, dl, VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations));
// If 'unsafe math' is enabled, fold
@@ -5756,23 +5757,34 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast ||
DAG.getTarget().Options.UnsafeFPMath) &&
DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) &&
- TLI.isOperationLegal(ISD::FMA, VT)) {
+ TLI.isOperationLegalOrCustom(ISD::FMA, VT)) {
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) {
- return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
+ return DAG.getNode(ISD::FMA, dl, VT,
N0.getOperand(0), N0.getOperand(1),
- DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, N1));
+ DAG.getNode(ISD::FNEG, dl, VT, N1));
}
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) {
- return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT,
- DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT,
+ return DAG.getNode(ISD::FMA, dl, VT,
+ DAG.getNode(ISD::FNEG, dl, VT,
N1.getOperand(0)),
N1.getOperand(1), N0);
}
+
+ // fold (fsub (-(fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
+ if (N0.getOpcode() == ISD::FNEG &&
+ N0.getOperand(0).getOpcode() == ISD::FMUL &&
+ N0->hasOneUse() && N0.getOperand(0).hasOneUse()) {
+ SDValue N00 = N0.getOperand(0).getOperand(0);
+ SDValue N01 = N0.getOperand(0).getOperand(1);
+ return DAG.getNode(ISD::FMA, dl, VT,
+ DAG.getNode(ISD::FNEG, dl, VT, N00), N01,
+ DAG.getNode(ISD::FNEG, dl, VT, N1));
+ }
}
return SDValue();
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 40283d87f8..fa19ad1388 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -730,6 +730,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+ setOperationAction(ISD::FMA, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
@@ -1071,6 +1072,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ if (Subtarget->hasFMA()) {
+ setOperationAction(ISD::FMA, MVT::v8f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f64, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f32, Custom);
+ setOperationAction(ISD::FMA, MVT::v2f64, Custom);
+ setOperationAction(ISD::FMA, MVT::f32, Custom);
+ setOperationAction(ISD::FMA, MVT::f64, Custom);
+ }
if (Subtarget->hasAVX2()) {
setOperationAction(ISD::ADD, MVT::v4i64, Legal);
setOperationAction(ISD::ADD, MVT::v8i32, Legal);
@@ -1220,6 +1229,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::FADD);
setTargetDAGCombine(ISD::FSUB);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SUB);
setTargetDAGCombine(ISD::LOAD);
setTargetDAGCombine(ISD::STORE);
@@ -11289,6 +11299,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::WIN_FTOL: return "X86ISD::WIN_FTOL";
case X86ISD::SAHF: return "X86ISD::SAHF";
case X86ISD::RDRAND: return "X86ISD::RDRAND";
+ case X86ISD::FMADD: return "X86ISD::FMADD";
+ case X86ISD::FMSUB: return "X86ISD::FMSUB";
+ case X86ISD::FNMADD: return "X86ISD::FNMADD";
+ case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
+ case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
+ case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
}
}
@@ -15108,6 +15124,40 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFMACombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget* Subtarget) {
+ DebugLoc dl = N->getDebugLoc();
+ EVT VT = N->getValueType(0);
+
+ EVT ScalarVT = VT.getScalarType();
+ if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget->hasFMA())
+ return SDValue();
+
+ SDValue A = N->getOperand(0);
+ SDValue B = N->getOperand(1);
+ SDValue C = N->getOperand(2);
+
+ bool NegA = (A.getOpcode() == ISD::FNEG);
+ bool NegB = (B.getOpcode() == ISD::FNEG);
+ bool NegC = (C.getOpcode() == ISD::FNEG);
+
+ // Negative multiplication when NegA xor NegB
+ bool NegMul = (NegA != NegB);
+ if (NegA)
+ A = A.getOperand(0);
+ if (NegB)
+ B = B.getOperand(0);
+ if (NegC)
+ C = C.getOperand(0);
+
+ unsigned Opcode;
+ if (!NegMul)
+ Opcode = (!NegC)? X86ISD::FMADD : X86ISD::FMSUB;
+ else
+ Opcode = (!NegC)? X86ISD::FNMADD : X86ISD::FNMSUB;
+ return DAG.getNode(Opcode, dl, VT, A, B, C);
+}
+
static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -15447,6 +15497,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERMILP:
case X86ISD::VPERM2X128:
case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
+ case ISD::FMA: return PerformFMACombine(N, DAG, Subtarget);
}
return SDValue();
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 78e4d757c2..3bc2e31ce5 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -294,6 +294,14 @@ namespace llvm {
// PMULUDQ - Vector multiply packed unsigned doubleword integers
PMULUDQ,
+ // FMA nodes
+ FMADD,
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
// VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
// according to %al. An operator is needed so that this can be expanded
// with control flow.
@@ -597,6 +605,12 @@ namespace llvm {
virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+ /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than
+ /// a pair of mul and add instructions. fmuladd intrinsics will be expanded to
+ /// FMAs when this method returns true (and FMAs are legal), otherwise fmuladd
+ /// is expanded to mul + add.
+ virtual bool isFMAFasterThanMulAndAdd(EVT) const { return true; }
+
/// isNarrowingProfitable - Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
/// from i32 to i8 but not from i32 to i16.
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 8802a2ecde..265b4bb997 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -19,156 +19,240 @@ let Constraints = "$src1 = $dst" in {
multiclass fma3p_rm<bits<8> opc, string OpcodeStr> {
let neverHasSideEffects = 1 in {
def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
} // neverHasSideEffects = 1
}
-// Intrinsic for 132 pattern
+// Intrinsic for 213 pattern
multiclass fma3p_rm_int<bits<8> opc, string OpcodeStr,
PatFrag MemFrag128, PatFrag MemFrag256,
- Intrinsic Int128, Intrinsic Int256> {
+ Intrinsic Int128, Intrinsic Int256, SDNode Op213,
+ ValueType OpVT128, ValueType OpVT256> {
def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src3, VR128:$src2))]>;
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
+ VR128:$src3))]>;
+
+ def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2,
+ VR128:$src1, VR128:$src3)))]>;
+
def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (Int128 VR128:$src1, (MemFrag128 addr:$src3), VR128:$src2))]>;
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (Int128 VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3)))]>;
+
+ def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (OpVT128 (Op213 VR128:$src2, VR128:$src1,
+ (MemFrag128 addr:$src3))))]>;
+
+
def rY_Int : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src3, VR256:$src2))]>;
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
+ VR256:$src3))]>;
+
+ def rY : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, VR256:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (OpVT256 (Op213 VR256:$src2, VR256:$src1,
+ VR256:$src3)))]>;
+
def mY_Int : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst,
- (Int256 VR256:$src1, (MemFrag256 addr:$src3), VR256:$src2))]>;
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst, (Int256 VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3)))]>;
+
+ def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR256:$dst,
+ (OpVT256 (Op213 VR256:$src2, VR256:$src1,
+ (MemFrag256 addr:$src3))))]>;
}
} // Constraints = "$src1 = $dst"
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy,
PatFrag MemFrag128, PatFrag MemFrag256,
- Intrinsic Int128, Intrinsic Int256> {
- defm r132 : fma3p_rm_int <opc132, !strconcat(OpcodeStr,
- !strconcat("132", PackTy)), MemFrag128, MemFrag256,
- Int128, Int256>;
- defm r132 : fma3p_rm <opc132, !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
- defm r213 : fma3p_rm <opc213, !strconcat(OpcodeStr, !strconcat("213", PackTy))>;
- defm r231 : fma3p_rm <opc231, !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
+ Intrinsic Int128, Intrinsic Int256, SDNode Op,
+ ValueType OpTy128, ValueType OpTy256> {
+ defm r213 : fma3p_rm_int <opc213, !strconcat(OpcodeStr,
+ !strconcat("213", PackTy)), MemFrag128, MemFrag256,
+ Int128, Int256, Op, OpTy128, OpTy256>;
+ defm r132 : fma3p_rm <opc132,
+ !strconcat(OpcodeStr, !strconcat("132", PackTy))>;
+ defm r231 : fma3p_rm <opc231,
+ !strconcat(OpcodeStr, !strconcat("231", PackTy))>;
}
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADDPS : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfmadd_ps, int_x86_fma_vfmadd_ps_256>;
- defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfmsub_ps, int_x86_fma_vfmsub_ps_256>;
+ memopv8f32, int_x86_fma_vfmadd_ps,
+ int_x86_fma_vfmadd_ps_256, X86Fmadd,
+ v4f32, v8f32>;
+ defm VFMSUBPS : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", memopv4f32,
+ memopv8f32, int_x86_fma_vfmsub_ps,
+ int_x86_fma_vfmsub_ps_256, X86Fmsub,
+ v4f32, v8f32>;
defm VFMADDSUBPS : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps",
- memopv4f32, memopv8f32, int_x86_fma_vfmaddsub_ps,
- int_x86_fma_vfmaddsub_ps_256>;
+ memopv4f32, memopv8f32,
+ int_x86_fma_vfmaddsub_ps,
+ int_x86_fma_vfmaddsub_ps_256, X86Fmaddsub,
+ v4f32, v8f32>;
defm VFMSUBADDPS : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps",
- memopv4f32, memopv8f32, int_x86_fma_vfmsubadd_ps,
- int_x86_fma_vfmaddsub_ps_256>;
+ memopv4f32, memopv8f32,
+ int_x86_fma_vfmsubadd_ps,
+ int_x86_fma_vfmaddsub_ps_256, X86Fmsubadd,
+ v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFMADDPD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmadd_pd, int_x86_fma_vfmadd_pd_256>, VEX_W;
+ memopv4f64, int_x86_fma_vfmadd_pd,
+ int_x86_fma_vfmadd_pd_256, X86Fmadd, v2f64,
+ v4f64>, VEX_W;
defm VFMSUBPD : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmsub_pd, int_x86_fma_vfmsub_pd_256>, VEX_W;
- defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmaddsub_pd, int_x86_fma_vfmaddsub_pd_256>, VEX_W;
- defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfmsubadd_pd, int_x86_fma_vfmsubadd_pd_256>, VEX_W;
+ memopv4f64, int_x86_fma_vfmsub_pd,
+ int_x86_fma_vfmsub_pd_256, X86Fmsub, v2f64,
+ v4f64>, VEX_W;
+ defm VFMADDSUBPD : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd",
+ memopv2f64, memopv4f64,
+ int_x86_fma_vfmaddsub_pd,
+ int_x86_fma_vfmaddsub_pd_256, X86Fmaddsub,
+ v2f64, v4f64>, VEX_W;
+ defm VFMSUBADDPD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "pd",
+ memopv2f64, memopv4f64,
+ int_x86_fma_vfmsubadd_pd,
+ int_x86_fma_vfmsubadd_pd_256, X86Fmsubadd,
+ v2f64, v4f64>, VEX_W;
}
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADDPS : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfnmadd_ps, int_x86_fma_vfnmadd_ps_256>;
+ memopv8f32, int_x86_fma_vfnmadd_ps,
+ int_x86_fma_vfnmadd_ps_256, X86Fnmadd, v4f32,
+ v8f32>;
defm VFNMSUBPS : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", memopv4f32,
- memopv8f32, int_x86_fma_vfnmsub_ps, int_x86_fma_vfnmsub_ps_256>;
+ memopv8f32, int_x86_fma_vfnmsub_ps,
+ int_x86_fma_vfnmsub_ps_256, X86Fnmsub, v4f32,
+ v8f32>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADDPD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfnmadd_pd, int_x86_fma_vfnmadd_pd_256>, VEX_W;
- defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", memopv2f64,
- memopv4f64, int_x86_fma_vfnmsub_pd, int_x86_fma_vfnmsub_pd_256>, VEX_W;
+ memopv4f64, int_x86_fma_vfnmadd_pd,
+ int_x86_fma_vfnmadd_pd_256, X86Fnmadd, v2f64,
+ v4f64>, VEX_W;
+ defm VFNMSUBPD : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd",
+ memopv2f64,
+ memopv4f64, int_x86_fma_vfnmsub_pd,
+ int_x86_fma_vfnmsub_pd_256, X86Fnmsub, v2f64,
+ v4f64>, VEX_W;
}
-
let Constraints = "$src1 = $dst" in {
multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
RegisterClass RC> {
let neverHasSideEffects = 1 in {
def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
let mayLoad = 1 in
def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, x86memop:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
} // neverHasSideEffects = 1
}
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic IntId> {
+ ComplexPattern mem_cpat, Intrinsic IntId,
+ RegisterClass RC, SDNode OpNode, ValueType OpVT> {
def r_Int : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (IntId VR128:$src1, VR128:$src3, VR128:$src2))]>;
+ (ins VR128:$src1, VR128:$src2, VR128:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst, (IntId VR128:$src2, VR128:$src1,
+ VR128:$src3))]>;
def m_Int : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, memop:$src3),
- !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst,
- (IntId VR128:$src1, mem_cpat:$src3, VR128:$src2))]>;
+ (ins VR128:$src1, VR128:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set VR128:$dst,
+ (IntId VR128:$src2, VR128:$src1, mem_cpat:$src3))]>;
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), []>;
}
} // Constraints = "$src1 = $dst"
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, Intrinsic IntF32, Intrinsic IntF64> {
+ string OpStr, Intrinsic IntF32, Intrinsic IntF64,
+ SDNode OpNode> {
defm SSr132 : fma3s_rm<opc132, !strconcat(OpStr, "132ss"), f32mem, FR32>;
- defm SSr213 : fma3s_rm<opc213, !strconcat(OpStr, "213ss"), f32mem, FR32>;
defm SSr231 : fma3s_rm<opc231, !strconcat(OpStr, "231ss"), f32mem, FR32>;
- defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>, VEX_W;
- defm SDr213 : fma3s_rm<opc213, !strconcat(OpStr, "213sd"), f64mem, FR64>, VEX_W;
- defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>, VEX_W;
- defm SSr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132ss"), ssmem,
- sse_load_f32, IntF32>;
- defm SDr132 : fma3s_rm_int <opc132, !strconcat(OpStr, "132sd"), sdmem,
- sse_load_f64, IntF64>;
+ defm SDr132 : fma3s_rm<opc132, !strconcat(OpStr, "132sd"), f64mem, FR64>,
+ VEX_W;
+ defm SDr231 : fma3s_rm<opc231, !strconcat(OpStr, "231sd"), f64mem, FR64>,
+ VEX_W;
+ defm SSr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213ss"), ssmem,
+ sse_load_f32, IntF32, FR32, OpNode, f32>;
+ defm SDr213 : fma3s_rm_int <opc213, !strconcat(OpStr, "213sd"), sdmem,
+ sse_load_f64, IntF64, FR64, OpNode, f64>, VEX_W;
}
defm VFMADD : fma3s_forms<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
- int_x86_fma_vfmadd_sd>, VEX_LIG;
+ int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
defm VFMSUB : fma3s_forms<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
- int_x86_fma_vfmsub_sd>, VEX_LIG;
+ int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
defm VFNMADD : fma3s_forms<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
- int_x86_fma_vfnmadd_sd>, VEX_LIG;
+ int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
defm VFNMSUB : fma3s_forms<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
- int_x86_fma_vfnmsub_sd>, VEX_LIG;
+ int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index ec030dd546..19de855ac5 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -125,7 +125,10 @@ def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTVBroadcast : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+ SDTCisSameAs<1,2>, SDTCisVT<3, i32>]>;
+
+def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
@@ -160,9 +163,15 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
-def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
-def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
-def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+def X86Blendpw : SDNode<"X86ISD::BLENDPW", SDTBlend>;
+def X86Blendps : SDNode<"X86ISD::BLENDPS", SDTBlend>;
+def X86Blendpd : SDNode<"X86ISD::BLENDPD", SDTBlend>;
+def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFma>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFma>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFma>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFma>;
+def X86Fmaddsub : SDNode<"X86ISD::FMSUBADD", SDTFma>;
+def X86Fmsubadd : SDNode<"X86ISD::FMADDSUB", SDTFma>;
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 89a57b53f3..3f16372497 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -1130,8 +1130,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMADDSDr132r, X86::VFMADDSDr132m, 0 },
{ X86::VFMADDSSr213r, X86::VFMADDSSr213m, 0 },
{ X86::VFMADDSDr213r, X86::VFMADDSDr213m, 0 },
- { X86::VFMADDSSr132r_Int, X86::VFMADDSSr132m_Int, 0 },
- { X86::VFMADDSDr132r_Int, X86::VFMADDSDr132m_Int, 0 },
+ { X86::VFMADDSSr213r_Int, X86::VFMADDSSr213m_Int, 0 },
+ { X86::VFMADDSDr213r_Int, X86::VFMADDSDr213m_Int, 0 },
{ X86::VFMADDPSr231r, X86::VFMADDPSr231m, TB_ALIGN_16 },
{ X86::VFMADDPDr231r, X86::VFMADDPDr231m, TB_ALIGN_16 },
@@ -1145,10 +1145,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMADDPDr132rY, X86::VFMADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMADDPSr213rY, X86::VFMADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMADDPDr213rY, X86::VFMADDPDr213mY, TB_ALIGN_32 },
- { X86::VFMADDPSr132r_Int, X86::VFMADDPSr132m_Int, TB_ALIGN_16 },
- { X86::VFMADDPDr132r_Int, X86::VFMADDPDr132m_Int, TB_ALIGN_16 },
- { X86::VFMADDPSr132rY_Int, X86::VFMADDPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFMADDPDr132rY_Int, X86::VFMADDPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDPSr213r_Int, X86::VFMADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPDr213r_Int, X86::VFMADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDPSr213rY_Int, X86::VFMADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDPDr213rY_Int, X86::VFMADDPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFNMADDSSr231r, X86::VFNMADDSSr231m, 0 },
{ X86::VFNMADDSDr231r, X86::VFNMADDSDr231m, 0 },
@@ -1156,8 +1156,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMADDSDr132r, X86::VFNMADDSDr132m, 0 },
{ X86::VFNMADDSSr213r, X86::VFNMADDSSr213m, 0 },
{ X86::VFNMADDSDr213r, X86::VFNMADDSDr213m, 0 },
- { X86::VFNMADDSSr132r_Int, X86::VFNMADDSSr132m_Int, 0 },
- { X86::VFNMADDSDr132r_Int, X86::VFNMADDSDr132m_Int, 0 },
+ { X86::VFNMADDSSr213r_Int, X86::VFNMADDSSr213m_Int, 0 },
+ { X86::VFNMADDSDr213r_Int, X86::VFNMADDSDr213m_Int, 0 },
{ X86::VFNMADDPSr231r, X86::VFNMADDPSr231m, TB_ALIGN_16 },
{ X86::VFNMADDPDr231r, X86::VFNMADDPDr231m, TB_ALIGN_16 },
@@ -1171,10 +1171,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMADDPDr132rY, X86::VFNMADDPDr132mY, TB_ALIGN_32 },
{ X86::VFNMADDPSr213rY, X86::VFNMADDPSr213mY, TB_ALIGN_32 },
{ X86::VFNMADDPDr213rY, X86::VFNMADDPDr213mY, TB_ALIGN_32 },
- { X86::VFNMADDPSr132r_Int, X86::VFNMADDPSr132m_Int, TB_ALIGN_16 },
- { X86::VFNMADDPDr132r_Int, X86::VFNMADDPDr132m_Int, TB_ALIGN_16 },
- { X86::VFNMADDPSr132rY_Int, X86::VFNMADDPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFNMADDPDr132rY_Int, X86::VFNMADDPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFNMADDPSr213r_Int, X86::VFNMADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPDr213r_Int, X86::VFNMADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMADDPSr213rY_Int, X86::VFNMADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFNMADDPDr213rY_Int, X86::VFNMADDPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMSUBSSr231r, X86::VFMSUBSSr231m, 0 },
{ X86::VFMSUBSDr231r, X86::VFMSUBSDr231m, 0 },
@@ -1182,8 +1182,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBSDr132r, X86::VFMSUBSDr132m, 0 },
{ X86::VFMSUBSSr213r, X86::VFMSUBSSr213m, 0 },
{ X86::VFMSUBSDr213r, X86::VFMSUBSDr213m, 0 },
- { X86::VFMSUBSSr132r_Int, X86::VFMSUBSSr132m_Int, 0 },
- { X86::VFMSUBSDr132r_Int, X86::VFMSUBSDr132m_Int, 0 },
+ { X86::VFMSUBSSr213r_Int, X86::VFMSUBSSr213m_Int, 0 },
+ { X86::VFMSUBSDr213r_Int, X86::VFMSUBSDr213m_Int, 0 },
{ X86::VFMSUBPSr231r, X86::VFMSUBPSr231m, TB_ALIGN_16 },
{ X86::VFMSUBPDr231r, X86::VFMSUBPDr231m, TB_ALIGN_16 },
@@ -1197,10 +1197,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBPDr132rY, X86::VFMSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBPSr213rY, X86::VFMSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBPDr213rY, X86::VFMSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFMSUBPSr132r_Int, X86::VFMSUBPSr132m_Int, TB_ALIGN_16 },
- { X86::VFMSUBPDr132r_Int, X86::VFMSUBPDr132m_Int, TB_ALIGN_16 },
- { X86::VFMSUBPSr132rY_Int, X86::VFMSUBPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFMSUBPDr132rY_Int, X86::VFMSUBPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBPSr213r_Int, X86::VFMSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPDr213r_Int, X86::VFMSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBPSr213rY_Int, X86::VFMSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBPDr213rY_Int, X86::VFMSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFNMSUBSSr231r, X86::VFNMSUBSSr231m, 0 },
{ X86::VFNMSUBSDr231r, X86::VFNMSUBSDr231m, 0 },
@@ -1208,8 +1208,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMSUBSDr132r, X86::VFNMSUBSDr132m, 0 },
{ X86::VFNMSUBSSr213r, X86::VFNMSUBSSr213m, 0 },
{ X86::VFNMSUBSDr213r, X86::VFNMSUBSDr213m, 0 },
- { X86::VFNMSUBSSr132r_Int, X86::VFNMSUBSSr132m_Int, 0 },
- { X86::VFNMSUBSDr132r_Int, X86::VFNMSUBSDr132m_Int, 0 },
+ { X86::VFNMSUBSSr213r_Int, X86::VFNMSUBSSr213m_Int, 0 },
+ { X86::VFNMSUBSDr213r_Int, X86::VFNMSUBSDr213m_Int, 0 },
{ X86::VFNMSUBPSr231r, X86::VFNMSUBPSr231m, TB_ALIGN_16 },
{ X86::VFNMSUBPDr231r, X86::VFNMSUBPDr231m, TB_ALIGN_16 },
@@ -1223,10 +1223,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFNMSUBPDr132rY, X86::VFNMSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFNMSUBPSr213rY, X86::VFNMSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFNMSUBPDr213rY, X86::VFNMSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFNMSUBPSr132r_Int, X86::VFNMSUBPSr132m_Int, TB_ALIGN_16 },
- { X86::VFNMSUBPDr132r_Int, X86::VFNMSUBPDr132m_Int, TB_ALIGN_16 },
- { X86::VFNMSUBPSr132rY_Int, X86::VFNMSUBPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFNMSUBPDr132rY_Int, X86::VFNMSUBPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFNMSUBPSr213r_Int, X86::VFNMSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPDr213r_Int, X86::VFNMSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFNMSUBPSr213rY_Int, X86::VFNMSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFNMSUBPDr213rY_Int, X86::VFNMSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMADDSUBPSr231r, X86::VFMADDSUBPSr231m, TB_ALIGN_16 },
{ X86::VFMADDSUBPDr231r, X86::VFMADDSUBPDr231m, TB_ALIGN_16 },
@@ -1240,10 +1240,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMADDSUBPDr132rY, X86::VFMADDSUBPDr132mY, TB_ALIGN_32 },
{ X86::VFMADDSUBPSr213rY, X86::VFMADDSUBPSr213mY, TB_ALIGN_32 },
{ X86::VFMADDSUBPDr213rY, X86::VFMADDSUBPDr213mY, TB_ALIGN_32 },
- { X86::VFMADDSUBPSr132r_Int, X86::VFMADDSUBPSr132m_Int, TB_ALIGN_16 },
- { X86::VFMADDSUBPDr132r_Int, X86::VFMADDSUBPDr132m_Int, TB_ALIGN_16 },
- { X86::VFMADDSUBPSr132rY_Int, X86::VFMADDSUBPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFMADDSUBPDr132rY_Int, X86::VFMADDSUBPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDSUBPSr213r_Int, X86::VFMADDSUBPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPDr213r_Int, X86::VFMADDSUBPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMADDSUBPSr213rY_Int, X86::VFMADDSUBPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMADDSUBPDr213rY_Int, X86::VFMADDSUBPDr213mY_Int, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr231r, X86::VFMSUBADDPSr231m, TB_ALIGN_16 },
{ X86::VFMSUBADDPDr231r, X86::VFMSUBADDPDr231m, TB_ALIGN_16 },
@@ -1257,10 +1257,10 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBADDPDr132rY, X86::VFMSUBADDPDr132mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPSr213rY, X86::VFMSUBADDPSr213mY, TB_ALIGN_32 },
{ X86::VFMSUBADDPDr213rY, X86::VFMSUBADDPDr213mY, TB_ALIGN_32 },
- { X86::VFMSUBADDPSr132r_Int, X86::VFMSUBADDPSr132m_Int, TB_ALIGN_16 },
- { X86::VFMSUBADDPDr132r_Int, X86::VFMSUBADDPDr132m_Int, TB_ALIGN_16 },
- { X86::VFMSUBADDPSr132rY_Int, X86::VFMSUBADDPSr132mY_Int, TB_ALIGN_32 },
- { X86::VFMSUBADDPDr132rY_Int, X86::VFMSUBADDPDr132mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBADDPSr213r_Int, X86::VFMSUBADDPSr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPDr213r_Int, X86::VFMSUBADDPDr213m_Int, TB_ALIGN_16 },
+ { X86::VFMSUBADDPSr213rY_Int, X86::VFMSUBADDPSr213mY_Int, TB_ALIGN_32 },
+ { X86::VFMSUBADDPDr213rY_Int, X86::VFMSUBADDPDr213mY_Int, TB_ALIGN_32 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl3); i != e; ++i) {
@@ -3481,6 +3481,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
OpcodeTablePtr = &RegOp2MemOpTable1;
} else if (i == 2) {
OpcodeTablePtr = &RegOp2MemOpTable2;
+ } else if (i == 3) {
+ OpcodeTablePtr = &RegOp2MemOpTable3;
}
// If table selected...
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 5deedb9dd9..b0c1d0a0dd 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,8 +1,11 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
-; CHECK: _fmaf
+; CHECK-FMA-INST: vfmadd213ss
+; CHECK-FMA-CALL: _fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -11,7 +14,8 @@ entry:
}
; CHECK: test_f64
-; CHECK: _fma
+; CHECK-FMA-INST: vfmadd213sd
+; CHECK-FMA-CALL: _fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 8659dfe326..90529e09d7 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,42 +1,42 @@
; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmadd132ss %xmm
+ ; CHECK: fmadd213ss %xmm
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmadd132ps
+ ; CHECK: fmadd213ps
%res = call <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: fmadd132ps {{.*\(%r.*}}, %ymm
+ ; CHECK: fmadd213ps {{.*\(%r.*}}, %ymm
%res = call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
ret <8 x float> %res
}
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmadd132ss %xmm
+ ; CHECK: fnmadd213ss %xmm
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmadd132ps
+ ; CHECK: fnmadd213ps
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmadd.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
- ; CHECK: fnmadd132ps {{.*\(%r.*}}, %ymm
+ ; CHECK: fnmadd213ps {{.*\(%r.*}}, %ymm
%res = call <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind
ret <8 x float> %res
}
@@ -44,28 +44,28 @@ declare <8 x float> @llvm.x86.fma.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x
define <4 x float> @test_x86_fmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmsub132ss
+ ; CHECK: fmsub213ss
%res = call <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmsub132ps
+ ; CHECK: fmsub213ps
%res = call <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfmsub.ps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmsub132ss
+ ; CHECK: fnmsub213ss
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.fma.vfnmsub.ss(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmsub132ps
+ ; CHECK: fnmsub213ps
%res = call <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
@@ -74,28 +74,28 @@ declare <4 x float> @llvm.x86.fma.vfnmsub.ps(<4 x float>, <4 x float>, <4 x floa
;;;;
define <2 x double> @test_x86_fmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmadd132sd
+ ; CHECK: fmadd213sd
%res = call <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmadd132pd
+ ; CHECK: fmadd213pd
%res = call <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmadd.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmadd_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmadd132sd
+ ; CHECK: fnmadd213sd
%res = call <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmadd.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmadd_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmadd132pd
+ ; CHECK: fnmadd213pd
%res = call <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
@@ -104,28 +104,28 @@ declare <2 x double> @llvm.x86.fma.vfnmadd.pd(<2 x double>, <2 x double>, <2 x d
define <2 x double> @test_x86_fmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmsub132sd
+ ; CHECK: fmsub213sd
%res = call <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fmsub132pd
+ ; CHECK: fmsub213pd
%res = call <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfmsub.pd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmsub_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmsub132sd
+ ; CHECK: fnmsub213sd
%res = call <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
define <2 x double> @test_x86_fnmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
- ; CHECK: fnmsub132pd
+ ; CHECK: fnmsub213pd
%res = call <2 x double> @llvm.x86.fma.vfnmsub.pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) nounwind
ret <2 x double> %res
}
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
new file mode 100644
index 0000000000..b04663a602
--- /dev/null
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -0,0 +1,139 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+
+; CHECK: test_x86_fmadd_ps
+; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fadd <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps
+; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %x, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps
+; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %res = fsub <4 x float> %a2, %x
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps
+; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = fmul <4 x float> %a0, %a1
+ %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmadd_ps_y
+; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fadd <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_y
+; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %x, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmadd_ps_y
+; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %res = fsub <8 x float> %a2, %x
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fnmsub_ps_y
+; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
+ %x = fmul <8 x float> %a0, %a1
+ %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
+ %res = fsub <8 x float> %y, %a2
+ ret <8 x float> %res
+}
+
+; CHECK: test_x86_fmadd_pd_y
+; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fadd <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd_y
+; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: ret
+define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+ %x = fmul <4 x double> %a0, %a1
+ %res = fsub <4 x double> %x, %a2
+ ret <4 x double> %res
+}
+
+; CHECK: test_x86_fmsub_pd
+; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+ %x = fmul <2 x double> %a0, %a1
+ %res = fsub <2 x double> %x, %a2
+ ret <2 x double> %res
+}
+
+; CHECK: test_x86_fnmadd_ss
+; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
+ %x = fmul float %a0, %a1
+ %res = fsub float %a2, %x
+ ret float %res
+}
+
+; CHECK: test_x86_fnmadd_sd
+; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %a2, %x
+ ret double %res
+}
+
+; CHECK: test_x86_fmsub_sd
+; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
+ %x = fmul double %a0, %a1
+ %res = fsub double %x, %a2
+ ret double %res
+}
+
+; CHECK: test_x86_fnmsub_ss
+; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: ret
+define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
+ %x = fsub float -0.000000e+00, %a0
+ %y = fmul float %x, %a1
+ %res = fsub float %y, %a2
+ ret float %res
+}