summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2014-05-12 07:18:51 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2014-05-12 07:18:51 +0000
commit1cec507d6d34201ac4319924bcd01c84a7583182 (patch)
tree3df086831790ee6fe4e82f54d0deaf33470c942e
parentf37d8e6b1adc9270c0a31354ffc78224124651c3 (diff)
downloadllvm-1cec507d6d34201ac4319924bcd01c84a7583182.tar.gz
llvm-1cec507d6d34201ac4319924bcd01c84a7583182.tar.bz2
llvm-1cec507d6d34201ac4319924bcd01c84a7583182.tar.xz
AVX-512: changes in intrinsics
1) Changed gather and scatter intrinsics. Now they are aligned with GCC built-ins. There is no more non-masked form. Masked intrinsic receives -1 if all lanes are executed. 2) I changed the function that works with intrinsics inside X86ISelLowering.cpp. I put all intrinsics in one table. I did it for INTRINSICS_W_CHAIN and plan to put all intrinsics from WO_CHAIN set to the same table in order to avoid the long-long "switch". (I wanted to use static map initialization that allowed by C++11 but I wasn't able to compile it on VS2012). 3) I added gather/scatter prefetch intrinsics. 4) I fixed MRMm encoding for masked instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208522 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/IR/IntrinsicsX86.td167
-rw-r--r--lib/Target/X86/MCTargetDesc/X86BaseInfo.h9
-rw-r--r--lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp4
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp354
-rw-r--r--lib/Target/X86/X86InstrAVX512.td56
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll206
6 files changed, 365 insertions, 431 deletions
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 145da599a1..36d93fee8a 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3029,141 +3029,104 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// Gather and Scatter ops
let TargetPrefix = "x86" in {
- def int_x86_avx512_gather_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpd512">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
- llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
+ def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gathersiv8df">,
+ Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
+ llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdps512">,
- Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i16_ty,
- llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
+ def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gathersiv16sf">,
+ Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty,
+ llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpd512">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+ def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8df">,
+ Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty,
+ llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqps512">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
+ def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16sf">,
+ Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty,
+ llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherdpd512">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_dps_512 : GCCBuiltin<"__builtin_ia32_gatherdps512">,
- Intrinsic<[llvm_v16f32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherqpd512">,
- Intrinsic<[llvm_v8f64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_qps_512 : GCCBuiltin<"__builtin_ia32_gatherqps512">,
- Intrinsic<[llvm_v8f32_ty], [llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty],
- [IntrReadArgMem]>;
-
- def int_x86_avx512_gather_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpq512">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
- llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherdpi512">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i16_ty,
- llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpq512">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_gatherqpi512">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gatherdpq512">,
- Intrinsic<[llvm_v8i64_ty], [llvm_v8i32_ty, llvm_ptr_ty,
- llvm_i32_ty],
+ def int_x86_avx512_gather_dpq_512 : GCCBuiltin<"__builtin_ia32_gathersiv8di">,
+ Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
+ llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gatherdpi512">,
- Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
- llvm_i32_ty],
+ def int_x86_avx512_gather_dpi_512 : GCCBuiltin<"__builtin_ia32_gathersiv16si">,
+ Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty,
+ llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherqpq512">,
+ def int_x86_avx512_gather_qpq_512 : GCCBuiltin<"__builtin_ia32_gatherdiv8di">,
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty],
+ llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
- def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherqpi512">,
- Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_ptr_ty,
- llvm_i32_ty],
+ def int_x86_avx512_gather_qpi_512 : GCCBuiltin<"__builtin_ia32_gatherdiv16si">,
+ Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty,
+ llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty],
[IntrReadArgMem]>;
+
// scatter
- def int_x86_avx512_scatter_dpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpd512">,
+ def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scattersiv8df">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdps512">,
+ def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scattersiv16sf">,
Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qpd_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpd512">,
+ def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8df">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qps_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqps512">,
+ def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16sf">,
Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterdpd512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f64_ty,
- llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dps_512 : GCCBuiltin<"__builtin_ia32_scatterdps512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_v16f32_ty,
- llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterqpd512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f64_ty,
- llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qps_512 : GCCBuiltin<"__builtin_ia32_scatterqps512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8f32_ty,
- llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpq512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty,
- llvm_v8i64_ty, llvm_i32_ty],
+ def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scattersiv8di">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
+ llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterdpi512">,
+ def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scattersiv16si">,
Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty,
llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qpq_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpq512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty],
+ def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterdiv8di">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,llvm_v8i64_ty, llvm_v8i64_ty,
+ llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_qpi_mask_512 : GCCBuiltin<"__builtin_ia32_mask_scatterqpi512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,
- llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty],
+ def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterdiv16si">,
+ Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty,
+ llvm_i32_ty],
[IntrReadWriteArgMem]>;
- def int_x86_avx512_scatter_dpq_512 : GCCBuiltin<"__builtin_ia32_scatterdpq512">,
- Intrinsic<[], [llvm_ptr_ty,
- llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty],
- []>;
- def int_x86_avx512_scatter_dpi_512 : GCCBuiltin<"__builtin_ia32_scatterdpi512">,
- Intrinsic<[], [llvm_ptr_ty,
- llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty],
- []>;
- def int_x86_avx512_scatter_qpq_512 : GCCBuiltin<"__builtin_ia32_scatterqpq512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i64_ty,
- llvm_i32_ty],
- []>;
- def int_x86_avx512_scatter_qpi_512 : GCCBuiltin<"__builtin_ia32_scatterqpi512">,
- Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i32_ty,
- llvm_i32_ty],
- []>;
+ // gather prefetch
+ def int_x86_avx512_gatherpf_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfdpd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_gatherpf_dps_512 : GCCBuiltin<"__builtin_ia32_gatherpfdps">,
+ Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_gatherpf_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfqpd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_gatherpf_qps_512 : GCCBuiltin<"__builtin_ia32_gatherpfqps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+
+ // scatter prefetch
+ def int_x86_avx512_scatterpf_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfdpd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_scatterpf_dps_512 : GCCBuiltin<"__builtin_ia32_scatterpfdps">,
+ Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_scatterpf_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfqpd">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
+ def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">,
+ Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty,
+ llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>;
}
// AVX-512 conflict detection
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 38fab15dc4..6aeb1f2c22 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -643,6 +643,10 @@ namespace X86II {
/// counted as one operand.
///
inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+ bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+ bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+ bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+
switch (TSFlags & X86II::FormMask) {
default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
case X86II::Pseudo:
@@ -660,9 +664,6 @@ namespace X86II {
case X86II::MRMDestMem:
return 0;
case X86II::MRMSrcMem: {
- bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
- bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
- bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
unsigned FirstMemOp = 1;
if (HasVEX_4V)
++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
@@ -690,6 +691,8 @@ namespace X86II {
unsigned FirstMemOp = 0;
if (HasVEX_4V)
++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+ if (HasEVEX_K)
+ ++FirstMemOp;// Skip the mask register
return FirstMemOp;
}
case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index c9bbdade19..2152b21068 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1428,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6r: case X86II::MRM7r: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitRegModRMByte(MI.getOperand(CurOp++),
@@ -1443,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM6m: case X86II::MRM7m: {
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
+ if (HasEVEX_K) // Skip writemask
+ ++CurOp;
EmitByte(BaseOpcode, CurByte, OS);
uint64_t Form = TSFlags & X86II::FormMask;
EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 7d24e23d08..2801f30b30 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12420,27 +12420,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain,
- const X86Subtarget * Subtarget) {
- SDLoc dl(Op);
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
- assert(C && "Invalid scale type");
- SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
- SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
- SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
- SDValue Segment = DAG.getRegister(0, MVT::i32);
- SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
- return DAG.getMergeValues(RetOps, dl);
-}
-
-static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Src, SDValue Mask, SDValue Base,
SDValue Index, SDValue ScaleOp, SDValue Chain,
const X86Subtarget * Subtarget) {
@@ -12450,7 +12429,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -12463,8 +12447,8 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Base, SDValue Index,
- SDValue ScaleOp, SDValue Chain) {
+ SDValue Src, SDValue Mask, SDValue Base,
+ SDValue Index, SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
@@ -12473,29 +12457,39 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Segment = DAG.getRegister(0, MVT::i32);
EVT MaskVT = MVT::getVectorVT(MVT::i1,
Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
return SDValue(Res, 1);
}
-static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
- SDValue Src, SDValue Mask, SDValue Base,
- SDValue Index, SDValue ScaleOp, SDValue Chain) {
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+ SDValue Mask, SDValue Base, SDValue Index,
+ SDValue ScaleOp, SDValue Chain) {
SDLoc dl(Op);
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
assert(C && "Invalid scale type");
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
SDValue Segment = DAG.getRegister(0, MVT::i32);
- EVT MaskVT = MVT::getVectorVT(MVT::i1,
- Index.getSimpleValueType().getVectorNumElements());
- SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
- SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
- SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
- return SDValue(Res, 1);
+ EVT MaskVT =
+ MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+ SDValue MaskInReg;
+ ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+ if (MaskC)
+ MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+ else
+ MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+ //SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+ SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+ return SDValue(Res, 0);
}
// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
@@ -12561,27 +12555,122 @@ static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getMergeValues(Results, DL);
}
+enum IntrinsicType {
+ GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+};
+
+struct IntrinsicData {
+ IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+ :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+ IntrinsicType Type;
+ unsigned Opc0;
+ unsigned Opc1;
+};
+
+std::map < unsigned, IntrinsicData> IntrMap;
+static void InitIntinsicsMap() {
+ static bool Initialized = false;
+ if (Initialized)
+ return;
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+ IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+ IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+ IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+ IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+ IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512,
+ IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512,
+ IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512,
+ IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512,
+ IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+ X86::VGATHERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+ X86::VGATHERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+ X86::VGATHERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+ X86::VGATHERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+ X86::VSCATTERPF1QPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+ X86::VSCATTERPF1QPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+ X86::VSCATTERPF1DPDm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512,
+ IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+ X86::VSCATTERPF1DPSm)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+ IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+ IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+ IntrinsicData(XTEST, X86ISD::XTEST, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+ IntrinsicData(RDTSC, X86ISD::RDTSC_DAG, 0)));
+ IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+ IntrinsicData(RDTSC, X86ISD::RDTSCP_DAG, 0)));
+ Initialized = true;
+}
+
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- SDLoc dl(Op);
+ InitIntinsicsMap();
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- switch (IntNo) {
- default: return SDValue(); // Don't custom lower most intrinsics.
+ std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+ if (itr == IntrMap.end())
+ return SDValue();
- // RDRAND/RDSEED intrinsics.
- case Intrinsic::x86_rdrand_16:
- case Intrinsic::x86_rdrand_32:
- case Intrinsic::x86_rdrand_64:
- case Intrinsic::x86_rdseed_16:
- case Intrinsic::x86_rdseed_32:
- case Intrinsic::x86_rdseed_64: {
- unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
- IntNo == Intrinsic::x86_rdseed_32 ||
- IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
- X86ISD::RDRAND;
+ SDLoc dl(Op);
+ IntrinsicData Intr = itr->second;
+ switch(Intr.Type) {
+ default:
+ llvm_unreachable("Unhandled intrinsic");
+ case RDSEED:
+ case RDRAND: {
// Emit the node with the right value type.
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
- SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+ SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12597,162 +12686,49 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
SDValue(Result.getNode(), 2));
}
- //int_gather(index, base, scale);
- case Intrinsic::x86_avx512_gather_qpd_512:
- case Intrinsic::x86_avx512_gather_qps_512:
- case Intrinsic::x86_avx512_gather_dpd_512:
- case Intrinsic::x86_avx512_gather_qpi_512:
- case Intrinsic::x86_avx512_gather_qpq_512:
- case Intrinsic::x86_avx512_gather_dpq_512:
- case Intrinsic::x86_avx512_gather_dps_512:
- case Intrinsic::x86_avx512_gather_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Index = Op.getOperand(2);
- SDValue Base = Op.getOperand(3);
- SDValue Scale = Op.getOperand(4);
- return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
- }
- //int_gather_mask(v1, mask, index, base, scale);
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- case Intrinsic::x86_avx512_gather_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_gather_qps_mask_512:
- Opc = X86::VGATHERQPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpd_mask_512:
- Opc = X86::VGATHERQPDZrm; break;
- case Intrinsic::x86_avx512_gather_dpd_mask_512:
- Opc = X86::VGATHERDPDZrm; break;
- case Intrinsic::x86_avx512_gather_dps_mask_512:
- Opc = X86::VGATHERDPSZrm; break;
- case Intrinsic::x86_avx512_gather_qpi_mask_512:
- Opc = X86::VPGATHERQDZrm; break;
- case Intrinsic::x86_avx512_gather_qpq_mask_512:
- Opc = X86::VPGATHERQQZrm; break;
- case Intrinsic::x86_avx512_gather_dpi_mask_512:
- Opc = X86::VPGATHERDDZrm; break;
- case Intrinsic::x86_avx512_gather_dpq_mask_512:
- Opc = X86::VPGATHERDQZrm; break;
- }
+ case GATHER: {
+ //gather(v1, mask, index, base, scale);
SDValue Chain = Op.getOperand(0);
SDValue Src = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
+ SDValue Base = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
- SDValue Base = Op.getOperand(5);
+ SDValue Mask = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+ return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
Subtarget);
}
- //int_scatter(base, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qpd_512:
- case Intrinsic::x86_avx512_scatter_qps_512:
- case Intrinsic::x86_avx512_scatter_dpd_512:
- case Intrinsic::x86_avx512_scatter_qpi_512:
- case Intrinsic::x86_avx512_scatter_qpq_512:
- case Intrinsic::x86_avx512_scatter_dpq_512:
- case Intrinsic::x86_avx512_scatter_dps_512:
- case Intrinsic::x86_avx512_scatter_dpi_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
- SDValue Chain = Op.getOperand(0);
- SDValue Base = Op.getOperand(2);
- SDValue Index = Op.getOperand(3);
- SDValue Src = Op.getOperand(4);
- SDValue Scale = Op.getOperand(5);
- return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
- }
- //int_scatter_mask(base, mask, index, v1, scale);
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_avx512_scatter_qpd_mask_512:
- Opc = X86::VSCATTERQPDZmr; break;
- case Intrinsic::x86_avx512_scatter_qps_mask_512:
- Opc = X86::VSCATTERQPSZmr; break;
- case Intrinsic::x86_avx512_scatter_dpd_mask_512:
- Opc = X86::VSCATTERDPDZmr; break;
- case Intrinsic::x86_avx512_scatter_dps_mask_512:
- Opc = X86::VSCATTERDPSZmr; break;
- case Intrinsic::x86_avx512_scatter_qpi_mask_512:
- Opc = X86::VPSCATTERQDZmr; break;
- case Intrinsic::x86_avx512_scatter_qpq_mask_512:
- Opc = X86::VPSCATTERQQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpq_mask_512:
- Opc = X86::VPSCATTERDQZmr; break;
- case Intrinsic::x86_avx512_scatter_dpi_mask_512:
- Opc = X86::VPSCATTERDDZmr; break;
- }
+ case SCATTER: {
+ //scatter(base, mask, index, v1, scale);
SDValue Chain = Op.getOperand(0);
SDValue Base = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue Index = Op.getOperand(4);
SDValue Src = Op.getOperand(5);
SDValue Scale = Op.getOperand(6);
- return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+ }
+ case PREFETCH: {
+ SDValue Hint = Op.getOperand(6);
+ unsigned HintVal;
+ if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
+ (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
+ llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+ unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Mask = Op.getOperand(2);
+ SDValue Index = Op.getOperand(3);
+ SDValue Base = Op.getOperand(4);
+ SDValue Scale = Op.getOperand(5);
+ return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
}
- // Read Time Stamp Counter (RDTSC).
- case Intrinsic::x86_rdtsc:
- // Read Time Stamp Counter and Processor ID (RDTSCP).
- case Intrinsic::x86_rdtscp: {
- unsigned Opc;
- switch (IntNo) {
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- case Intrinsic::x86_rdtsc:
- Opc = X86ISD::RDTSC_DAG; break;
- case Intrinsic::x86_rdtscp:
- Opc = X86ISD::RDTSCP_DAG; break;
- }
+ // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+ case RDTSC: {
SmallVector<SDValue, 2> Results;
- getReadTimeStampCounter(Op.getNode(), dl, Opc, DAG, Subtarget, Results);
+ getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
- case Intrinsic::x86_xtest: {
+ case XTEST: {
SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 1cd1adcfc5..37bcc5235e 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -4070,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
EVEX_V512, EVEX_CD8<32, CD8VT1>;
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+ RegisterClass KRC, X86MemOperand memop> {
+ let Predicates = [HasPFI], hasSideEffects = 1 in
+ def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+ !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
+ []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+ VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+ VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+ VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+ VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index e429a224f4..20bf7e4a16 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
;CHECK-LABEL: gather_mask_dps
;CHECK: kmovw
@@ -17,9 +17,9 @@ declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x dou
;CHECK: vscatterdps
;CHECK: ret
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
@@ -30,9 +30,9 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
;CHECK: vscatterdpd
;CHECK: ret
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
ret void
}
@@ -43,9 +43,9 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
;CHECK: vscatterqps
;CHECK: ret
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
@@ -56,23 +56,23 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
;CHECK: vscatterqpd
;CHECK: ret
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
ret void
}
;;
;; Integer Gather/Scatter
;;
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
;CHECK-LABEL: gather_mask_dd
;CHECK: kmovw
@@ -81,9 +81,9 @@ declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64
;CHECK: vpscatterdd
;CHECK: ret
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
@@ -94,9 +94,9 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
;CHECK: vpscatterqd
;CHECK: ret
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
@@ -107,9 +107,9 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
;CHECK: vpscatterqq
;CHECK: ret
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
@@ -120,116 +120,19 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
;CHECK: vpscatterdq
;CHECK: ret
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
ret void
}
-;; FP Intinsics without masks
-
-declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
-
-;CHECK-LABEL: gather_dps
-;CHECK: kxnorw
-;CHECK: vgatherdps
-;CHECK: vscatterdps
-;CHECK: ret
-define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qps
-;CHECK: kxnorw
-;CHECK: vgatherqps
-;CHECK: vscatterqps
-;CHECK: ret
-define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpd
-;CHECK: kxnorw
-;CHECK: vgatherqpd
-;CHECK: vpadd
-;CHECK: vscatterqpd
-;CHECK: ret
-define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
- ret void
-}
-
-;; Integer Intinsics without masks
-
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
-
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
-
-;CHECK-LABEL: gather_dpi
-;CHECK: kxnorw
-;CHECK: vpgatherdd
-;CHECK: vpscatterdd
-;CHECK: ret
-define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpq
-;CHECK: vpxord %zmm
-;CHECK: kxnorw
-;CHECK: vpgatherqq
-;CHECK: vpadd
-;CHECK: vpscatterqq
-;CHECK: ret
-define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpi
-;CHECK: vpxor %ymm
-;CHECK: kxnorw
-;CHECK: vpgatherqd
-;CHECK: vpadd
-;CHECK: vpscatterqd
-;CHECK: ret
-define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
- ret void
-}
;CHECK-LABEL: gather_mask_dpd_execdomain
;CHECK: vgatherdpd
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -239,7 +142,7 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -249,7 +152,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
;CHECK: vmovaps
;CHECK: ret
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
- %res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
ret <16 x float> %res;
}
@@ -258,7 +161,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
;CHECK: vmovaps
;CHECK: ret
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
- %res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
}
@@ -268,7 +171,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
;CHECK: ret
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -278,7 +181,7 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -288,7 +191,7 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = load <16 x float>* %src, align 64
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
ret void
}
@@ -298,6 +201,35 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
;CHECK: ret
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x float>* %src, align 32
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: prefetch
+;CHECK: gatherpf0
+;CHECK: gatherpf1
+;CHECK: scatterpf0
+;CHECK: scatterpf1
+;CHECK: ret
+declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+define void @prefetch(<8 x i64> %ind, i8* %base) {
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}