summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2013-08-26 12:45:35 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2013-08-26 12:45:35 +0000
commit92bfb547700550fcdb668862533e4952a8d74969 (patch)
treeb4ed707646d62007d9781b52759757674f318f72 /lib
parente4bf77a1282bfdacb61bae192fdf79a696be780a (diff)
downloadllvm-92bfb547700550fcdb668862533e4952a8d74969.tar.gz
llvm-92bfb547700550fcdb668862533e4952a8d74969.tar.bz2
llvm-92bfb547700550fcdb668862533e4952a8d74969.tar.xz
AVX-512: Added shuffle instructions -
VPSHUFD, VPERMILPS, VMOVDDUP, VMOVLHPS, VMOVHLPS, VSHUFPS, VALIGN single and double forms. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189215 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp68
-rw-r--r--lib/Target/X86/X86InstrAVX512.td139
-rw-r--r--lib/Target/X86/X86InstrSSE.td4
3 files changed, 177 insertions, 34 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index a00f848d2a..6a7ca7d5b4 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -3600,7 +3600,7 @@ static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
return false;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
// Do not handle 64-bit element shuffles with palignr.
@@ -3683,10 +3683,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
/// specifies a shuffle of elements that is suitable for input to 128/256-bit
/// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
/// reverse of what x86 shuffles want.
-static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
- bool Commuted = false) {
- if (!HasFp256 && VT.is256BitVector())
- return false;
+static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3695,6 +3692,10 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
if (NumLaneElems != 2 && NumLaneElems != 4)
return false;
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ bool symetricMaskRequired =
+ (VT.getSizeInBits() >= 256) && (EltSize == 32);
+
// VSHUFPSY divides the resulting vector into 4 chunks.
// The sources are also splitted into 4 chunks, and each destination
// chunk must come from a different source chunk.
@@ -3714,6 +3715,7 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
//
// DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0
//
+ SmallVector<int, 4> MaskVal(NumLaneElems, -1);
unsigned HalfLaneElems = NumLaneElems/2;
for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
for (unsigned i = 0; i != NumLaneElems; ++i) {
@@ -3724,9 +3726,13 @@ static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256,
// For VSHUFPSY, the mask of the second half must be the same as the
// first but with the appropriate offsets. This works in the same way as
// VPERMILPS works with masks.
- if (NumElems != 8 || l == 0 || Mask[i] < 0)
+ if (!symetricMaskRequired || Idx < 0)
+ continue;
+ if (MaskVal[i] < 0) {
+ MaskVal[i] = Idx - l;
continue;
- if (!isUndefOrEqual(Idx, Mask[i]+l))
+ }
+ if ((signed)(Idx - l) != MaskVal[i])
return false;
}
}
@@ -4158,31 +4164,32 @@ static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
/// to the same elements of the low, but to the higher half of the source.
/// In VPERMILPD the two lanes could be shuffled independently of each other
/// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
-static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
- if (!HasFp256)
+static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
+ unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+ if (VT.getSizeInBits() < 256 || EltSize < 32)
return false;
-
+ bool symetricMaskRequired = (EltSize == 32);
unsigned NumElts = VT.getVectorNumElements();
- // Only match 256-bit with 32/64-bit types
- if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
- return false;
unsigned NumLanes = VT.getSizeInBits()/128;
unsigned LaneSize = NumElts/NumLanes;
+ // 2 or 4 elements in one lane
+
+ SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
for (unsigned l = 0; l != NumElts; l += LaneSize) {
for (unsigned i = 0; i != LaneSize; ++i) {
if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
return false;
- if (NumElts != 8 || l == 0)
- continue;
- // VPERMILPS handling
- if (Mask[i] < 0)
- continue;
- if (!isUndefOrEqual(Mask[i+l], Mask[i]+l))
- return false;
+ if (symetricMaskRequired) {
+ if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
+ ExpectedMaskVal[i] = Mask[i+l] - l;
+ continue;
+ }
+ if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
+ return false;
+ }
}
}
-
return true;
}
@@ -4431,10 +4438,11 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
MVT VT = SVOp->getSimpleValueType(0);
- unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
+ unsigned EltSize = VT.is512BitVector() ? 1 :
+ VT.getVectorElementType().getSizeInBits() >> 3;
unsigned NumElts = VT.getVectorNumElements();
- unsigned NumLanes = VT.getSizeInBits()/128;
+ unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
unsigned NumLaneElts = NumElts/NumLanes;
int Val = 0;
@@ -7407,7 +7415,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
}
// Normalize the node to match x86 shuffle ops if needed
- if (!V2IsUndef && (isSHUFPMask(M, VT, HasFp256, /* Commuted */ true)))
+ if (!V2IsUndef && (isSHUFPMask(M, VT, /* Commuted */ true)))
return CommuteVectorShuffle(SVOp, DAG);
// The checks below are all present in isShuffleMaskLegal, but they are
@@ -7430,7 +7438,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
getShufflePSHUFLWImmediate(SVOp),
DAG);
- if (isSHUFPMask(M, VT, HasFp256))
+ if (isSHUFPMask(M, VT))
return getTargetShuffleNode(X86ISD::SHUFP, dl, VT, V1, V2,
getShuffleSHUFImmediate(SVOp), DAG);
@@ -7449,8 +7457,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
// Handle VPERMILPS/D* permutations
- if (isVPERMILPMask(M, VT, HasFp256)) {
- if (HasInt256 && VT == MVT::v8i32)
+ if (isVPERMILPMask(M, VT)) {
+ if ((HasInt256 && VT == MVT::v8i32) || VT == MVT::v16i32)
return getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1,
getShuffleSHUFImmediate(SVOp), DAG);
return getTargetShuffleNode(X86ISD::VPERMILP, dl, VT, V1,
@@ -13621,7 +13629,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
return (SVT.getVectorNumElements() == 2 ||
ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
isMOVLMask(M, SVT) ||
- isSHUFPMask(M, SVT, Subtarget->hasFp256()) ||
+ isSHUFPMask(M, SVT) ||
isPSHUFDMask(M, SVT) ||
isPSHUFHWMask(M, SVT, Subtarget->hasInt256()) ||
isPSHUFLWMask(M, SVT, Subtarget->hasInt256()) ||
@@ -13646,8 +13654,8 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
if (NumElts == 4 && SVT.is128BitVector()) {
return (isMOVLMask(Mask, SVT) ||
isCommutedMOVLMask(Mask, SVT, true) ||
- isSHUFPMask(Mask, SVT, Subtarget->hasFp256()) ||
- isSHUFPMask(Mask, SVT, Subtarget->hasFp256(), /* Commuted */ true));
+ isSHUFPMask(Mask, SVT) ||
+ isSHUFPMask(Mask, SVT, /* Commuted */ true));
}
return false;
}
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 17be5df394..cf4a0f56eb 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -1621,6 +1621,45 @@ defm VPUNPCKHDQZ : avx512_unpack_int<0x6A, "vpunpckhdq", X86Unpckh, v16i32,
defm VPUNPCKHQDQZ : avx512_unpack_int<0x6D, "vpunpckhqdq", X86Unpckh, v8i64,
VR512, memopv8i64, i512mem>, EVEX_V512,
VEX_W, EVEX_CD8<64, CD8VF>;
+//===----------------------------------------------------------------------===//
+// AVX-512 - PSHUFD
+//
+
+multiclass avx512_pshuf_imm<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ SDNode OpNode, PatFrag mem_frag,
+ X86MemOperand x86memop, ValueType OpVT> {
+ def ri : AVX512Ii8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1, (i8 imm:$src2))))]>,
+ EVEX;
+ def mi : AVX512Ii8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins x86memop:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set RC:$dst,
+ (OpVT (OpNode (mem_frag addr:$src1),
+ (i8 imm:$src2))))]>, EVEX;
+}
+
+defm VPSHUFDZ : avx512_pshuf_imm<0x70, "vpshufd", VR512, X86PShufd, memopv16i32,
+ i512mem, v16i32>, OpSize, EVEX_V512, EVEX_CD8<32, CD8VF>;
+
+let ExeDomain = SSEPackedSingle in
+defm VPERMILPSZ : avx512_pshuf_imm<0x04, "vpermilps", VR512, X86VPermilp,
+ memopv16f32, i512mem, v16f32>, OpSize, TA, EVEX_V512,
+ EVEX_CD8<32, CD8VF>;
+let ExeDomain = SSEPackedDouble in
+defm VPERMILPDZ : avx512_pshuf_imm<0x05, "vpermilpd", VR512, X86VPermilp,
+ memopv8f64, i512mem, v8f64>, OpSize, TA, EVEX_V512,
+ VEX_W, EVEX_CD8<32, CD8VF>;
+
+def : Pat<(v16i32 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+ (VPERMILPSZri VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86VPermilp VR512:$src1, (i8 imm:$imm))),
+ (VPERMILPDZri VR512:$src1, imm:$imm)>;
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
@@ -1774,8 +1813,8 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr, RegisterClass KRC,
defm VPTESTMDZ : avx512_vptest<0x27, "vptestmd", VK16, VR512, f512mem,
memopv16i32, X86testm, v16i32>, EVEX_V512,
EVEX_CD8<32, CD8VF>;
-defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem, memopv8i64,
- X86testm, v8i64>, EVEX_V512, VEX_W,
+defm VPTESTMQZ : avx512_vptest<0x27, "vptestmq", VK8, VR512, f512mem,
+ memopv8i64, X86testm, v8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
@@ -1914,3 +1953,99 @@ defm VPSRAVDZ : avx512_var_shift<0x46, "vpsravd", sra, VR512, v16i32,
defm VPSRAVQZ : avx512_var_shift<0x46, "vpsravq", sra, VR512, v8i64,
i512mem, memopv8i64>, EVEX_V512, VEX_W,
EVEX_CD8<64, CD8VF>;
+
+//===----------------------------------------------------------------------===//
+// AVX-512 - MOVDDUP
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_movddup<string OpcodeStr, RegisterClass RC, ValueType VT,
+ X86MemOperand x86memop, PatFrag memop_frag> {
+def rr : AVX512PDI<0x12, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (VT (X86Movddup RC:$src)))]>, EVEX;
+def rm : AVX512PDI<0x12, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst,
+ (VT (X86Movddup (memop_frag addr:$src))))]>, EVEX;
+}
+
+defm VMOVDDUPZ : avx512_movddup<"vmovddup", VR512, v8f64, f512mem, memopv8f64>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))),
+ (VMOVDDUPZrm addr:$src)>;
+
+def VMOVLHPSZrr : AVX512PSI<0x16, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovlhps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movlhps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
+ (ins VR128X:$src1, VR128X:$src2),
+ "vmovhlps{z}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
+ IIC_SSE_MOV_LH>, EVEX_4V;
+
+// MOVLHPS patterns
+def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
+def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
+ (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
+
+// MOVHLPS patterns
+def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
+ (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
+//===----------------------------------------------------------------------===//
+// VSHUFPS - VSHUFPD Operations
+
+multiclass avx512_shufp<RegisterClass RC, X86MemOperand x86memop,
+ ValueType vt, string OpcodeStr, PatFrag mem_frag,
+ Domain d> {
+ def rmi : AVX512PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
+ (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+ EVEX_4V, TB, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ def rri : AVX512PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
+ (i8 imm:$src3))))], d, IIC_SSE_SHUFP>,
+ EVEX_4V, TB, Sched<[WriteShuffle]>;
+}
+
+defm VSHUFPSZ : avx512_shufp<VR512, f512mem, v16f32, "vshufps", memopv16f32,
+ SSEPackedSingle>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VSHUFPDZ : avx512_shufp<VR512, f512mem, v8f64, "vshufpd", memopv8f64,
+ SSEPackedDouble>, OpSize, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+
+multiclass avx512_alignr<string OpcodeStr, RegisterClass RC,
+ X86MemOperand x86memop> {
+ def rri : AVX512AIi8<0x03, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, EVEX_4V;
+ def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, i8imm:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
+ []>, EVEX_4V;
+}
+defm VALIGND : avx512_alignr<"valignd", VR512, i512mem>,
+ EVEX_V512, EVEX_CD8<32, CD8VF>;
+defm VALIGNQ : avx512_alignr<"valignq", VR512, i512mem>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8f64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v16i32 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNDrri VR512:$src2, VR512:$src1, imm:$imm)>;
+def : Pat<(v8i64 (X86PAlignr VR512:$src1, VR512:$src2, (i8 imm:$imm))),
+ (VALIGNQrri VR512:$src2, VR512:$src1, imm:$imm)>;
+
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 4eaba38e52..9b27e27e8d 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1327,7 +1327,7 @@ let Predicates = [UseSSE2] in {
// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
//===----------------------------------------------------------------------===//
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseAVX] in {
def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
"movlhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1358,7 +1358,7 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
IIC_SSE_MOV_LH>, Sched<[WriteShuffle]>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [UseAVX] in {
// MOVLHPS patterns
def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
(VMOVLHPSrr VR128:$src1, VR128:$src2)>;