author    Ekaterina Romanova <katya_romanova@playstation.sony.com>  2013-11-21 23:21:26 +0000
committer Ekaterina Romanova <katya_romanova@playstation.sony.com>  2013-11-21 23:21:26 +0000
commit    46f7257ed1bbd2a169f6c930a805e702407d955b (patch)
tree      35e063ff39eb0d44f13c7e126c9e964902fb49b4 /lib/Target
parent    934d1f83aecf8f01646f4b2a09167309a4c1bb8d (diff)
SHLD/SHRD are VectorPath (microcoded) instructions known to have poor latency on certain architectures. While generating SHLD/SHRD instructions is acceptable when optimizing for size, optimizing for speed on these platforms should instead use alternative sequences of instructions composed of add, adc, shr, shl, or, and lea, which are DirectPath instructions. These alternatives not only have lower latency but also increase decode bandwidth by allowing a third DirectPath instruction to be decoded simultaneously.
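For illustration only (this sketch is not part of the commit), the pattern in question is the 64-bit double-precision shift; fast-SHLD targets can lower it to a single shld, while the affected AMD parts are better served by the DirectPath shl/shr/or expansion:

    // Minimal C++ sketch of the pattern the combine below matches.
    // Hypothetical example; assumes 0 < Amt < 64, as the DAG pattern does.
    #include <cstdint>

    uint64_t ShiftLeftDouble(uint64_t Hi, uint64_t Lo, unsigned Amt) {
      // Compiles to a single shld on fast-SHLD targets; otherwise a shl,
      // a shr of (64 - Amt), and an or -- all DirectPath instructions.
      return (Hi << Amt) | (Lo >> (64 - Amt));
    }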
AMD's processor families K7, K8, K10, K12, K15, and K16 are known to have SHLD/SHRD implementations with very poor latency. Optimization guides for these processors recommend using an alternative sequence of instructions. For these AMD processors, I disabled folding (or (x << c) | (y >> (64 - c))) when we are not optimizing for size. It might be beneficial to disable this folding for some of Intel's processors as well. However, since I could not find specific recommendations regarding the use of SHLD/SHRD instructions on Intel processors, I have not disabled this peephole for Intel.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@195383 91177308-0d34-0410-b5e6-96231b3b80d8
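As a usage sketch (the input file name double-shift.ll is hypothetical), the effect should be visible from the llc command line once a slow-shld CPU is selected:

    # k8 now carries +slow-shld, so the fold is skipped and shl/shr/or is emitted:
    llc -mtriple=x86_64-unknown-unknown -mcpu=k8 double-shift.ll -o -
    # Overriding the feature should restore the single SHLD:
    llc -mtriple=x86_64-unknown-unknown -mcpu=k8 -mattr=-slow-shld double-shift.ll -o -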
Diffstat (limited to 'lib/Target')
-rw-r--r--  lib/Target/X86/X86.td               45
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  12
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp     10
-rw-r--r--  lib/Target/X86/X86Subtarget.h        4
4 files changed, 53 insertions, 18 deletions
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 65c5552de2..ebe1a82626 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -73,6 +73,8 @@ def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
[Feature64Bit]>;
def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
"Bit testing of memory is slow">;
+def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
+ "SHLD instruction is slow">;
def FeatureFastUAMem : SubtargetFeature<"fast-unaligned-mem",
"IsUAMemFast", "true",
"Fast unaligned memory access">;
@@ -268,46 +270,53 @@ def : ProcessorModel<"knl", HaswellModel,
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
def : Proc<"k6-3", [Feature3DNow]>;
-def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
-def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-tbird", [Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-4", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-xp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
+def : Proc<"athlon-mp", [FeatureSSE1, Feature3DNowA, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
def : Proc<"k8", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon-fx", [FeatureSSE2, Feature3DNowA, Feature64Bit,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"k8-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"opteron-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"athlon64-sse3", [FeatureSSE3, Feature3DNowA, FeatureCMPXCHG16B,
- FeatureSlowBTMem]>;
+ FeatureSlowBTMem, FeatureSlowSHLD]>;
def : Proc<"amdfam10", [FeatureSSE4A,
Feature3DNowA, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem]>;
+ FeaturePOPCNT, FeatureSlowBTMem,
+ FeatureSlowSHLD]>;
// Bobcat
def : Proc<"btver1", [FeatureSSSE3, FeatureSSE4A, FeatureCMPXCHG16B,
- FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT]>;
+ FeaturePRFCHW, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowSHLD]>;
// Jaguar
def : Proc<"btver2", [FeatureAVX, FeatureSSE4A, FeatureCMPXCHG16B,
FeaturePRFCHW, FeatureAES, FeaturePCLMUL,
FeatureBMI, FeatureF16C, FeatureMOVBE,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Bulldozer
def : Proc<"bdver1", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
- FeatureLZCNT, FeaturePOPCNT]>;
+ FeatureLZCNT, FeaturePOPCNT, FeatureSlowSHLD]>;
// Piledriver
def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePRFCHW, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
- FeaturePOPCNT, FeatureBMI, FeatureTBM,
- FeatureFMA]>;
+ FeaturePOPCNT, FeatureBMI, FeatureTBM,
+ FeatureFMA, FeatureSlowSHLD]>;
// Steamroller
def : Proc<"bdver3", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ec5ae33ef5..23910a350e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -17892,6 +17892,18 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+ MachineFunction &MF = DAG.getMachineFunction();
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+
+ // SHLD/SHRD instructions have lower register pressure, but on some
+ // platforms they have higher latency than the equivalent
+ // series of shifts/or that would otherwise be generated.
+ // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
+ // have higher latencies and we are not optimizing for size.
+ if (!OptForSize && Subtarget->isSHLDSlow())
+ return SDValue();
+
if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
std::swap(N0, N1);
if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
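The OptForSize flag above reads the IR-level optsize function attribute. A reduced input of the sort that exercises this combine might look like the following (hypothetical example, not a test from this commit); dropping optsize on a slow-shld CPU makes the combine bail out:

    define i64 @lshift_double(i64 %x, i64 %y, i64 %c) optsize {
      %shl = shl i64 %x, %c
      %sub = sub i64 64, %c
      %shr = lshr i64 %y, %sub
      %or = or i64 %shl, %shr
      ret i64 %or
    }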
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 05db662b2c..fa04c38a85 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -263,6 +263,15 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
ToggleFeature(X86::FeatureSlowBTMem);
}
+ // Determine if SHLD/SHRD instructions have higher latency than the
+ // equivalent series of shifts/or instructions.
+ // FIXME: Add Intel's processors that have SHLD instructions with very
+ // poor latency.
+ if (IsAMD) {
+ IsSHLDSlow = true;
+ ToggleFeature(X86::FeatureSlowSHLD);
+ }
+
// If it's an Intel chip since Nehalem and not an Atom chip, unaligned
// memory access is fast. We hard code model numbers here because they
// aren't strictly increasing for Intel chips it seems.
@@ -514,6 +523,7 @@ void X86Subtarget::initializeEnvironment() {
HasPRFCHW = false;
HasRDSEED = false;
IsBTMemSlow = false;
+ IsSHLDSlow = false;
IsUAMemFast = false;
HasVectorUAMem = false;
HasCmpxchg16b = false;
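The IsAMD flag consulted above is derived from CPUID vendor detection earlier in AutoDetectSubtargetFeatures(). As a standalone sketch (not the exact LLVM code), the vendor check boils down to comparing the little-endian bytes of "AuthenticAMD":

    // Hypothetical helper: CPUID leaf 0 returns the vendor string
    // "AuthenticAMD" spread across EBX, EDX, ECX on AMD parts.
    static bool isAMDVendor(unsigned EBX, unsigned EDX, unsigned ECX) {
      return EBX == 0x68747541 /* "Auth" */ &&
             EDX == 0x69746E65 /* "enti" */ &&
             ECX == 0x444D4163 /* "cAMD" */;
    }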
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index dd8c0811ce..cddd9d898e 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -142,6 +142,9 @@ protected:
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
+ /// IsSHLDSlow - True if SHLD instructions are slow.
+ bool IsSHLDSlow;
+
/// IsUAMemFast - True if unaligned memory access is fast.
bool IsUAMemFast;
@@ -292,6 +295,7 @@ public:
bool hasPRFCHW() const { return HasPRFCHW; }
bool hasRDSEED() const { return HasRDSEED; }
bool isBTMemSlow() const { return IsBTMemSlow; }
+ bool isSHLDSlow() const { return IsSHLDSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }