summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorBruno Cardoso Lopes <bruno.cardoso@gmail.com>2011-07-14 18:50:58 +0000
committerBruno Cardoso Lopes <bruno.cardoso@gmail.com>2011-07-14 18:50:58 +0000
commit62f67f86fea55562ef28650b87f73924855438fa (patch)
treed4f39f450bc8b5f1c2d78af64c896331cbca883f /lib
parenteac0796542d098caa371856d545faa6cdab5aad3 (diff)
downloadllvm-62f67f86fea55562ef28650b87f73924855438fa.tar.gz
llvm-62f67f86fea55562ef28650b87f73924855438fa.tar.bz2
llvm-62f67f86fea55562ef28650b87f73924855438fa.tar.xz
Add 256-bit load/store recognition and matching in several places.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@135171 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib')
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp46
-rw-r--r--lib/Target/X86/X86InstrSSE.td32
2 files changed, 74 insertions, 4 deletions
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 702331d8ed..0509a1515a 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -301,12 +301,17 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVAPDrr, X86::MOVAPDmr, 0, 16 },
{ X86::MOVAPSrr, X86::MOVAPSmr, 0, 16 },
{ X86::MOVDQArr, X86::MOVDQAmr, 0, 16 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYmr, 0, 32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYmr, 0, 32 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYmr, 0, 32 },
{ X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
{ X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
{ X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
{ X86::MOVSS2DIrr, X86::MOVSS2DImr, 0, 0 },
{ X86::MOVUPDrr, X86::MOVUPDmr, 0, 0 },
{ X86::MOVUPSrr, X86::MOVUPSmr, 0, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYmr, 0, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYmr, 0, 0 },
{ X86::MUL16r, X86::MUL16m, 1, 0 },
{ X86::MUL32r, X86::MUL32m, 1, 0 },
{ X86::MUL64r, X86::MUL64m, 1, 0 },
@@ -411,10 +416,13 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOV8rr, X86::MOV8rm, 0 },
{ X86::MOVAPDrr, X86::MOVAPDrm, 16 },
{ X86::MOVAPSrr, X86::MOVAPSrm, 16 },
+ { X86::VMOVAPDYrr, X86::VMOVAPDYrm, 32 },
+ { X86::VMOVAPSYrr, X86::VMOVAPSYrm, 32 },
{ X86::MOVDDUPrr, X86::MOVDDUPrm, 0 },
{ X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 },
{ X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 },
{ X86::MOVDQArr, X86::MOVDQArm, 16 },
+ { X86::VMOVDQAYrr, X86::VMOVDQAYrm, 16 },
{ X86::MOVSHDUPrr, X86::MOVSHDUPrm, 16 },
{ X86::MOVSLDUPrr, X86::MOVSLDUPrm, 16 },
{ X86::MOVSX16rr8, X86::MOVSX16rm8, 0 },
@@ -425,6 +433,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::MOVSX64rr8, X86::MOVSX64rm8, 0 },
{ X86::MOVUPDrr, X86::MOVUPDrm, 16 },
{ X86::MOVUPSrr, X86::MOVUPSrm, 0 },
+ { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
+ { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
{ X86::MOVZDI2PDIrr, X86::MOVZDI2PDIrm, 0 },
{ X86::MOVZQI2PQIrr, X86::MOVZQI2PQIrm, 0 },
{ X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
@@ -787,6 +797,9 @@ static bool isFrameLoadOpcode(int Opcode) {
case X86::MOVAPSrm:
case X86::MOVAPDrm:
case X86::MOVDQArm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
return true;
@@ -808,6 +821,9 @@ static bool isFrameStoreOpcode(int Opcode) {
case X86::MOVAPSmr:
case X86::MOVAPDmr:
case X86::MOVDQAmr:
+ case X86::VMOVAPSYmr:
+ case X86::VMOVAPDYmr:
+ case X86::VMOVDQAYmr:
case X86::MMX_MOVD64mr:
case X86::MMX_MOVQ64mr:
case X86::MMX_MOVNTQmr:
@@ -926,6 +942,10 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
case X86::MOVUPSrm:
case X86::MOVAPDrm:
case X86::MOVDQArm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
case X86::MMX_MOVD64rm:
case X86::MMX_MOVQ64rm:
case X86::FsMOVAPSrm:
@@ -1975,6 +1995,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opc = X86::MOV8rr;
} else if (X86::VR128RegClass.contains(DestReg, SrcReg))
Opc = X86::MOVAPSrr;
+ else if (X86::VR256RegClass.contains(DestReg, SrcReg))
+ Opc = X86::VMOVAPSYrr;
else if (X86::VR64RegClass.contains(DestReg, SrcReg))
Opc = X86::MMX_MOVQ64rr;
else
@@ -2064,6 +2086,13 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
return load ? X86::MOVAPSrm : X86::MOVAPSmr;
else
return load ? X86::MOVUPSrm : X86::MOVUPSmr;
+ case 32:
+ assert(X86::VR256RegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
+ // If stack is realigned we can use aligned stores.
+ if (isStackAligned)
+ return load ? X86::VMOVAPSYrm : X86::VMOVAPSYmr;
+ else
+ return load ? X86::VMOVUPSYrm : X86::VMOVUPSYmr;
}
}
@@ -2853,6 +2882,11 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
break;
}
switch (Opc2) {
@@ -2875,6 +2909,11 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
case X86::MOVAPDrm:
case X86::MOVDQArm:
case X86::MOVDQUrm:
+ case X86::VMOVAPSYrm:
+ case X86::VMOVUPSYrm:
+ case X86::VMOVAPDYrm:
+ case X86::VMOVDQAYrm:
+ case X86::VMOVDQUYrm:
break;
}
@@ -3053,6 +3092,13 @@ static const unsigned ReplaceableInstrs[][3] = {
{ X86::AVX_SET0PS, X86::AVX_SET0PD, X86::AVX_SET0PI },
{ X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
{ X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+ // AVX 256-bit support
+ { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
+ { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
+ { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr },
+ { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
+ { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
+ { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
};
// FIXME: Some shuffle and unpack instructions have equivalents in different
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index e6167c73d3..da66e87a59 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -2022,7 +2022,10 @@ def V_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
}
// The same as done above but for AVX. The 128-bit versions are the
-// same, but re-encoded. The 256-bit does not support PI version.
+// same, but re-encoded. The 256-bit does not support PI version, and
+// doesn't need it because on sandy bridge the register is set to zero
+// at the rename stage without using any execution unit, so SET0PSY
+// and SET0PDY can be used for vector int instructions without penalty
// FIXME: Change encoding to pseudo! This is blocked right now by the x86
// JIT implementatioan, it does not expand the instructions below like
// X86MCInstLower does.
@@ -2037,8 +2040,8 @@ def AVX_SET0PSY : PSI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
def AVX_SET0PDY : PDI<0x57, MRMInitReg, (outs VR256:$dst), (ins), "",
[(set VR256:$dst, (v4f64 immAllZerosV))]>, VEX_4V;
let ExeDomain = SSEPackedInt in
-def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
- [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+def AVX_SET0PI : PDI<0xEF, MRMInitReg, (outs VR128:$dst), (ins), "",
+ [(set VR128:$dst, (v4i32 immAllZerosV))]>;
}
def : Pat<(v2i64 immAllZerosV), (V_SET0PI)>;
@@ -3831,6 +3834,8 @@ def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
(CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
// Use movaps / movups for SSE integer load / store (one byte shorter).
+// The instructions selected below are then converted to MOVDQA/MOVDQU
+// during the SSE domain pass.
let Predicates = [HasSSE1] in {
def : Pat<(alignedloadv4i32 addr:$src),
(MOVAPSrm addr:$src)>;
@@ -3859,8 +3864,9 @@ let Predicates = [HasSSE1] in {
(MOVUPSmr addr:$dst, VR128:$src)>;
}
-// Use vmovaps/vmovups for AVX 128-bit integer load/store (one byte shorter).
+// Use vmovaps/vmovups for AVX integer load/store.
let Predicates = [HasAVX] in {
+ // 128-bit load/store
def : Pat<(alignedloadv4i32 addr:$src),
(VMOVAPSrm addr:$src)>;
def : Pat<(loadv4i32 addr:$src),
@@ -3886,6 +3892,24 @@ let Predicates = [HasAVX] in {
(VMOVUPSmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVUPSmr addr:$dst, VR128:$src)>;
+
+ // 256-bit load/store
+ def : Pat<(alignedloadv4i64 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv4i64 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedloadv8i32 addr:$src),
+ (VMOVAPSYrm addr:$src)>;
+ def : Pat<(loadv8i32 addr:$src),
+ (VMOVUPSYrm addr:$src)>;
+ def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
+ (VMOVAPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v4i64 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
+ def : Pat<(store (v8i32 VR256:$src), addr:$dst),
+ (VMOVUPSYmr addr:$dst, VR256:$src)>;
}
//===----------------------------------------------------------------------===//