The current X86 NOP padding uses one long NOP followed by the remainder in one-byte NOPs. If the processor actually executes those NOPs, as it sometimes does with aligned bundling, this can have a performance impact. In my micro-benchmarks, run on a single machine, a 15-byte NOP followed by twelve one-byte NOPs is about 20% worse than a 15-byte NOP followed by a 12-byte NOP. This patch changes NOP emission to emit as many 15-byte NOPs (the maximum length) as possible, followed by at most one shorter NOP.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176464 91177308-0d34-0410-b5e6-96231b3b80d8
author: David Sehr <sehr@google.com> 2013-03-05 00:02:23 +0000
committer: David Sehr <sehr@google.com> 2013-03-05 00:02:23 +0000
commit: 6c4265a541c9e431961113c1a5d92fb4628bfe13 (patch)
tree: b98b6adf1f9527b4ca89e194005765c3cc0b0ccf
parent: 880e8c0ad41345f353b819c51092baa8f05e1950 (diff)
download: llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.gz
llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.bz2
llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.xz
2 files changed, 39 insertions, 12 deletions
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index acc90eceba..598ddee56d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -315,18 +315,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
     return true;
   }
 
-  // Write an optimal sequence for the first 15 bytes.
-  const uint64_t OptimalCount = (Count < 16) ? Count : 15;
-  const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
-  for (uint64_t i = 0, e = Prefixes; i != e; i++)
-    OW->Write8(0x66);
-  const uint64_t Rest = OptimalCount - Prefixes;
-  for (uint64_t i = 0, e = Rest; i != e; i++)
-    OW->Write8(Nops[Rest - 1][i]);
-
-  // Finish with single byte nops.
-  for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
-   OW->Write8(0x90);
+  // 15 is the longest single nop instruction.  Emit as many 15-byte nops as
+  // needed, then emit a nop of the remaining length.
+  do {
+    const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+    const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+    for (uint8_t i = 0; i < Prefixes; i++)
+      OW->Write8(0x66);
+    const uint8_t Rest = ThisNopLength - Prefixes;
+    for (uint8_t i = 0; i < Rest; i++)
+      OW->Write8(Nops[Rest - 1][i]);
+    Count -= ThisNopLength;
+  } while (Count != 0);
 
   return true;
 }
diff --git a/test/MC/X86/AlignedBundling/long-nop-pad.s b/test/MC/X86/AlignedBundling/long-nop-pad.s
new file mode 100644
index 0000000000..ea33e2889b
--- /dev/null
+++ b/test/MC/X86/AlignedBundling/long-nop-pad.s
@@ -0,0 +1,27 @@
+# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
+# RUN:   | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
+
+# Test that long nops are generated for padding where possible.
+
+  .text
+foo:
+  .bundle_align_mode 5
+
+# This callq instruction is 5 bytes long
+  .bundle_lock align_to_end
+  callq   bar
+  .bundle_unlock
+# To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP.
+# CHECK:        0:  nop
+# CHECK-NEXT:   f:  nop
+# CHECK-NEXT:   1b: callq
+
+# This push instruction is 1 byte long
+  .bundle_lock align_to_end
+  push %rax
+  .bundle_unlock
+# To align this group to a bundle end, we need two 15-byte NOPs, and a 1-byte.
+# CHECK:        20:  nop
+# CHECK-NEXT:   2f:  nop
+# CHECK-NEXT:   3e:  nop
+# CHECK-NEXT:   3f: pushq
author	David Sehr <sehr@google.com>	2013-03-05 00:02:23 +0000
committer	David Sehr <sehr@google.com>	2013-03-05 00:02:23 +0000
commit	6c4265a541c9e431961113c1a5d92fb4628bfe13 (patch)
tree	b98b6adf1f9527b4ca89e194005765c3cc0b0ccf
parent	880e8c0ad41345f353b819c51092baa8f05e1950 (diff)
download	llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.gz llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.bz2 llvm-6c4265a541c9e431961113c1a5d92fb4628bfe13.tar.xz