From 6abfcbdfc8ceee8ad97ec36b0303e3ed89bcb347 Mon Sep 17 00:00:00 2001 From: Juergen Ributzka Date: Wed, 4 Dec 2013 00:39:08 +0000 Subject: [Stackmap] Emit multi-byte nops for X86. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@196334 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86MCInstLower.cpp | 92 +++++++++++---- test/CodeGen/X86/patchpoint.ll | 10 +- test/CodeGen/X86/stackmap-nops.ll | 230 ++++++++++++++++++++++++++++++++++++++ test/MC/X86/stackmap-nops.ll | 47 ++++++++ 4 files changed, 350 insertions(+), 29 deletions(-) create mode 100644 test/CodeGen/X86/stackmap-nops.ll create mode 100644 test/MC/X86/stackmap-nops.ll diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 51ff713e63..85af0b4556 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -674,27 +674,76 @@ static void LowerTlsAddr(MCStreamer &OutStreamer, .addExpr(tlsRef)); } +/// \brief Emit the optimal amount of multi-byte nops on X86. +static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit) { + // This works only for 64bit. For 32bit we have to do additional checking if + // the CPU supports multi-byte nops. 
+ assert(Is64Bit && "EmitNops only supports X86-64"); + while (NumBytes) { + unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg; + Opc = IndexReg = Displacement = SegmentReg = 0; + BaseReg = X86::RAX; ScaleVal = 1; + switch (NumBytes) { + case 0: llvm_unreachable("Zero nops?"); break; + case 1: NumBytes -= 1; Opc = X86::NOOP; break; + case 2: NumBytes -= 2; Opc = X86::XCHG16ar; break; + case 3: NumBytes -= 3; Opc = X86::NOOPL; break; + case 4: NumBytes -= 4; Opc = X86::NOOPL; Displacement = 8; break; + case 5: NumBytes -= 5; Opc = X86::NOOPL; Displacement = 8; + IndexReg = X86::RAX; break; + case 6: NumBytes -= 6; Opc = X86::NOOPW; Displacement = 8; + IndexReg = X86::RAX; break; + case 7: NumBytes -= 7; Opc = X86::NOOPL; Displacement = 512; break; + case 8: NumBytes -= 8; Opc = X86::NOOPL; Displacement = 512; + IndexReg = X86::RAX; break; + case 9: NumBytes -= 9; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; break; + default: NumBytes -= 10; Opc = X86::NOOPW; Displacement = 512; + IndexReg = X86::RAX; SegmentReg = X86::CS; break; + } + + unsigned NumPrefixes = std::min(NumBytes, 5U); + NumBytes -= NumPrefixes; + for (unsigned i = 0; i != NumPrefixes; ++i) + OS.EmitBytes("\x66"); + + switch (Opc) { + default: llvm_unreachable("Unexpected opcode"); break; + case X86::NOOP: + OS.EmitInstruction(MCInstBuilder(Opc)); + break; + case X86::XCHG16ar: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX)); + break; + case X86::NOOPL: + case X86::NOOPW: + OS.EmitInstruction(MCInstBuilder(Opc).addReg(BaseReg).addImm(ScaleVal) + .addReg(IndexReg) + .addImm(Displacement) + .addReg(SegmentReg)); + break; + } + } // while (NumBytes) +} + // Lower a stackmap of the form: // <id>, <shadowBytes>, ... 
-static void LowerSTACKMAP(MCStreamer &OutStreamer, - StackMaps &SM, - const MachineInstr &MI) -{ - unsigned NumNOPBytes = MI.getOperand(1).getImm(); +static void LowerSTACKMAP(MCStreamer &OS, StackMaps &SM, + const MachineInstr &MI, bool Is64Bit) { + unsigned NumBytes = MI.getOperand(1).getImm(); SM.recordStackMap(MI); // Emit padding. // FIXME: These nops ensure that the stackmap's shadow is covered by // instructions from the same basic block, but the nops should not be // necessary if instructions from the same block follow the stackmap. - for (unsigned i = 0; i < NumNOPBytes; ++i) - OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP)); + EmitNops(OS, NumBytes, Is64Bit); } // Lower a patchpoint of the form: // [<def>], <id>, <numBytes>, <target>, <numArgs>, <cc>, ... -static void LowerPATCHPOINT(MCStreamer &OutStreamer, - StackMaps &SM, - const MachineInstr &MI) { +static void LowerPATCHPOINT(MCStreamer &OS, StackMaps &SM, + const MachineInstr &MI, bool Is64Bit) { + assert(Is64Bit && "Patchpoint currently only supports X86-64"); SM.recordPatchPoint(MI); PatchPointOpers opers(&MI); @@ -704,22 +753,21 @@ static void LowerPATCHPOINT(MCStreamer &OutStreamer, if (CallTarget) { // Emit MOV to materialize the target address and the CALL to target. // This is encoded with 12-13 bytes, depending on which register is used. - // We conservatively assume that it is 12 bytes and emit in worst case one - // extra NOP byte. - EncodedBytes = 12; - OutStreamer.EmitInstruction(MCInstBuilder(X86::MOV64ri) - .addReg(MI.getOperand(ScratchIdx).getReg()) - .addImm(CallTarget)); - OutStreamer.EmitInstruction(MCInstBuilder(X86::CALL64r) - .addReg(MI.getOperand(ScratchIdx).getReg())); + unsigned ScratchReg = MI.getOperand(ScratchIdx).getReg(); + if (X86II::isX86_64ExtendedReg(ScratchReg)) + EncodedBytes = 13; + else + EncodedBytes = 12; + OS.EmitInstruction(MCInstBuilder(X86::MOV64ri).addReg(ScratchReg) + .addImm(CallTarget)); + OS.EmitInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg)); } // Emit padding. 
unsigned NumBytes = opers.getMetaOper(PatchPointOpers::NBytesPos).getImm(); assert(NumBytes >= EncodedBytes && "Patchpoint can't request size less than the length of a call."); - for (unsigned i = EncodedBytes; i < NumBytes; ++i) - OutStreamer.EmitInstruction(MCInstBuilder(X86::NOOP)); + EmitNops(OS, NumBytes - EncodedBytes, Is64Bit); } void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { @@ -813,10 +861,10 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } case TargetOpcode::STACKMAP: - return LowerSTACKMAP(OutStreamer, SM, *MI); + return LowerSTACKMAP(OutStreamer, SM, *MI, Subtarget->is64Bit()); case TargetOpcode::PATCHPOINT: - return LowerPATCHPOINT(OutStreamer, SM, *MI); + return LowerPATCHPOINT(OutStreamer, SM, *MI, Subtarget->is64Bit()); case X86::MORESTACK_RET: OutStreamer.EmitInstruction(MCInstBuilder(X86::RET)); diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll index d534639953..2fd1942f7f 100644 --- a/test/CodeGen/X86/patchpoint.ll +++ b/test/CodeGen/X86/patchpoint.ll @@ -7,10 +7,10 @@ entry: ; CHECK-LABEL: trivial_patchpoint_codegen: ; CHECK: movabsq $-559038736, %r11 ; CHECK-NEXT: callq *%r11 -; CHECK-NEXT: nop +; CHECK-NEXT: xchgw %ax, %ax ; CHECK: movq %rax, %[[REG:r.+]] ; CHECK: callq *%r11 -; CHECK-NEXT: nop +; CHECK-NEXT: xchgw %ax, %ax ; CHECK: movq %[[REG]], %rax ; CHECK: ret %resolveCall2 = inttoptr i64 -559038736 to i8* @@ -84,11 +84,7 @@ define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { entry: ; CHECK-LABEL: small_patchpoint_codegen: ; CHECK: Ltmp -; CHECK: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop -; CHECK-NEXT: nop +; CHECK: nopl 8(%rax,%rax) ; CHECK-NEXT: popq ; CHECK-NEXT: ret %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2) diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll new file mode 100644 index 0000000000..e4f7527bd2 --- 
/dev/null +++ b/test/CodeGen/X86/stackmap-nops.ll @@ -0,0 +1,230 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s + +define void @nop_test() { +entry: +; CHECK-LABEL: nop_test: +; CHECK: nop +; CHECK: xchgw %ax, %ax +; CHECK: nopl (%rax) +; CHECK: nopl 8(%rax) +; CHECK: nopl 8(%rax,%rax) +; CHECK: nopw 8(%rax,%rax) +; CHECK: nopl 512(%rax) +; CHECK: nopl 512(%rax,%rax) +; CHECK: nopw 512(%rax,%rax) +; CHECK: nopw %cs:512(%rax,%rax) + +; 11 +; CHECK: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 12 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 13 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 14 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 15 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 16 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nop + +; 17 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: xchgw %ax, %ax + +; 18 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopl (%rax) + +; 19 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopl 8(%rax) + +; 20 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw 
%cs:512(%rax,%rax) +; CHECK-NEXT: nopl 8(%rax,%rax) + +; 21 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopw 8(%rax,%rax) + +; 22 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopl 512(%rax) + +; 23 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopl 512(%rax,%rax) + +; 24 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopw 512(%rax,%rax) + +; 25 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 26 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 27 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 28 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +;29 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; 
CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + +; 30 +; CHECK: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: .byte 102 +; CHECK-NEXT: nopw %cs:512(%rax,%rax) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 0, i32 0) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 1, i32 1) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 2, i32 2) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 3) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 4) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 5, i32 5) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 6, i32 6) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 7, i32 7) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 8, i32 8) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 9, i32 9) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 10, i32 10) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 11, i32 11) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 12) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 13) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 14) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 15, i32 15) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 16, i32 16) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 17) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 18, i32 18) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 19, 
i32 19) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 20, i32 20) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 21, i32 21) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 22, i32 22) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 23, i32 23) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 24, i32 24) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 25, i32 25) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 26, i32 26) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 27, i32 27) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 28, i32 28) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 29, i32 29) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 30, i32 30) + ret void +} + +declare void @llvm.experimental.stackmap(i32, i32, ...) diff --git a/test/MC/X86/stackmap-nops.ll b/test/MC/X86/stackmap-nops.ll new file mode 100644 index 0000000000..e6db891955 --- /dev/null +++ b/test/MC/X86/stackmap-nops.ll @@ -0,0 +1,47 @@ +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -filetype=obj %s -o - | llvm-objdump -d - | FileCheck %s +; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim -filetype=asm %s -o - | llvm-mc -filetype=obj - | llvm-objdump -d - | FileCheck %s + +define void @nop_test() { +entry: +; CHECK: 0: 55 +; CHECK: 1: 48 89 e5 + +; CHECK: 4: 90 +; CHECK: 5: 66 90 +; CHECK: 7: 0f 1f 00 +; CHECK: a: 0f 1f 40 08 +; CHECK: e: 0f 1f 44 00 08 +; CHECK: 13: 66 0f 1f 44 00 08 +; CHECK: 19: 0f 1f 80 00 02 00 00 +; CHECK: 20: 0f 1f 84 00 00 02 00 00 +; CHECK: 28: 66 0f 1f 84 00 00 02 00 00 +; CHECK: 31: 2e 66 0f 1f 84 00 00 02 00 00 +; CHECK: 3b: 66 2e 66 0f 1f 84 00 00 02 00 00 +; CHECK: 46: 66 66 2e 66 0f 1f 84 00 00 02 00 00 +; CHECK: 52: 66 66 66 2e 66 0f 1f 84 00 00 02 00 00 +; CHECK: 5f: 66 66 66 66 2e 66 0f 1f 84 00 00 02 00 00 +; 
CHECK: 6d: 66 66 66 66 66 2e 66 0f 1f 84 00 00 02 00 00 + +; CHECK: 7c: 5d +; CHECK: 7d: c3 + + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 0, i32 0) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 1, i32 1) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 2, i32 2) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 3) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 4) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 5, i32 5) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 6, i32 6) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 7, i32 7) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 8, i32 8) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 9, i32 9) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 10, i32 10) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 11, i32 11) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 12) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 13) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 14) + tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 15, i32 15) + ret void +} + +declare void @llvm.experimental.stackmap(i32, i32, ...) -- cgit v1.2.3