Introduce a pass to insert vzeroupper instructions to avoid AVX to

SSE transition penalty. The pass is enabled through the "x86-use-vzeroupper" llc command line option. This is only the first step (very naive and conservative one) to sketch out the idea, but proper DFA is coming next to allow smarter decisions. Comments and ideas now and in further commits will be very appreciated. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@138317 91177308-0d34-0410-b5e6-96231b3b80d8
author: Bruno Cardoso Lopes <bruno.cardoso@gmail.com> 2011-08-23 01:14:17 +0000
committer: Bruno Cardoso Lopes <bruno.cardoso@gmail.com> 2011-08-23 01:14:17 +0000
commit: 3bde6fe0df05558b89e7edfe48ac05da59beb81a (patch)
tree: 011a10aa34d5fb2d2afa5786803bd3f240a9d2a7
parent: 7e99b5c8a36e3e8d611e47122f9c596b58ccf3e8 (diff)
download: llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.gz
llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.bz2
llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.xz
5 files changed, 153 insertions, 0 deletions
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 1fd55128e2..c481eb9552 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -32,6 +32,7 @@ set(sources
   X86Subtarget.cpp
   X86TargetMachine.cpp
   X86TargetObjectFile.cpp
+  X86VZeroUpper.cpp
   )
 
 if( CMAKE_CL_64 )
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index d1e193304a..d480d0c865 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -48,6 +48,11 @@ FunctionPass *createX86FloatingPointStackifierPass();
 /// crossings.
 FunctionPass *createSSEDomainFixPass();
 
+/// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions
+/// before calls to functions not defined in the current module, to avoid the
+/// transition penalty between functions encoded with AVX and SSE.
+FunctionPass *createX86IssueVZeroUpperPass();
+
 /// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
 /// to the specified MCE object.
 FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 569c0408f3..95e7021dce 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -16,6 +16,7 @@
 #include "llvm/PassManager.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegistry.h"
@@ -92,6 +93,16 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
 }
 
 //===----------------------------------------------------------------------===//
+// Command line options for x86
+//===----------------------------------------------------------------------===//
+bool UseVZeroUpper; // Backing storage for the -x86-use-vzeroupper option.
+
+static cl::opt<bool, true>
+VZeroUpper("x86-use-vzeroupper",
+  cl::desc("Minimize AVX to SSE transition penalty"),
+  cl::location(UseVZeroUpper), cl::init(false)); // Off unless requested.
+
+//===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
 
@@ -125,6 +136,11 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM,
     PM.add(createSSEDomainFixPass());
     return true;
   }
+
+  if (Subtarget.hasAVX() && UseVZeroUpper) {
+    PM.add(createX86IssueVZeroUpperPass());
+    return true;
+  }
   return false;
 }
 
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
new file mode 100644
index 0000000000..d87efc99b2
--- /dev/null
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -0,0 +1,105 @@
+//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 AVX vzeroupper instructions
+// before calls to SSE encoded functions. This avoids transition latency
+// penalty when transferring control between AVX encoded instructions and old
+// SSE encoding mode.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
+
+namespace {
+  struct VZeroUpperInserter : public MachineFunctionPass {
+    static char ID; // Passed to the MachineFunctionPass constructor below.
+    VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF); // Entry point.
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    virtual const char *getPassName() const { return "X86 vzeroupper inserter";}
+
+  private:
+    const TargetInstrInfo *TII; // Instruction info, set in runOnMachineFunction.
+    MachineBasicBlock *MBB;     // Block being processed, set in processBasicBlock.
+  };
+  char VZeroUpperInserter::ID = 0;
+}
+
+FunctionPass *llvm::createX86IssueVZeroUpperPass() {
+  return new VZeroUpperInserter(); // Caller receives the new pass.
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// vzero upper instructions before function calls.
+bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  bool Changed = false;
+
+  // Visit each block and accumulate the per-block results into Changed.
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    Changed |= processBasicBlock(MF, *BB);
+
+  return Changed;
+}
+
+/// isCallToModuleFn - Return true if the call MI targets a function defined
+/// in the current module, judged by the first global-address operand found.
+static bool isCallToModuleFn(const MachineInstr *MI) {
+  assert(MI->getDesc().isCall() && "Isn't a call instruction");
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isGlobal())
+      continue;
+
+    // Only the first global operand is inspected; it decides the answer.
+    const GlobalValue *GV = MO.getGlobal();
+    GlobalValue::LinkageTypes LT = GV->getLinkage();
+    return GV->isInternalLinkage(LT) || GV->isPrivateLinkage(LT) ||
+           (GV->isExternalLinkage(LT) && !GV->isDeclaration());
+  }
+  // No global operand found: not known to be a module function.
+  return false;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// inserting vzero upper instructions before calls to functions that are not
+/// known to be defined in this module. Returns true if the block was modified.
+bool VZeroUpperInserter::processBasicBlock(MachineFunction &MF,
+                                           MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    // Insert a vzeroupper instruction before each control transfer
+    // to functions outside this module
+    if (MI->getDesc().isCall() && !isCallToModuleFn(MI)) {
+      BuildMI(*MBB, I, I->getDebugLoc(), TII->get(X86::VZEROUPPER));
+      ++NumVZU;
+      Changed = true; // Tell the pass manager the function was modified.
+    }
+  }
+
+  return Changed;
+}
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
new file mode 100644
index 0000000000..eaf236c6c7
--- /dev/null
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
+entry: ; Defined in this module, so calls to it should not get a vzeroupper.
+  %add.i = fadd <4 x float> %a, %a
+  ret <4 x float> %add.i
+}
+
+; CHECK: _test00
+define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %b
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: callq _do_sse
+  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind ; external callee
+  %sub.i = fsub <4 x float> %call3, %add.i
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: callq _do_sse_local
+  %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i) ; module-local callee
+  ; CHECK: vzeroupper
+  ; CHECK-NEXT: jmp _do_sse
+  %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind ; external callee, expected as a jmp
+  ret <4 x float> %call10
+}
+
+declare <4 x float> @do_sse(<4 x float>)
author	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>	2011-08-23 01:14:17 +0000
committer	Bruno Cardoso Lopes <bruno.cardoso@gmail.com>	2011-08-23 01:14:17 +0000
commit	3bde6fe0df05558b89e7edfe48ac05da59beb81a (patch)
tree	011a10aa34d5fb2d2afa5786803bd3f240a9d2a7
parent	7e99b5c8a36e3e8d611e47122f9c596b58ccf3e8 (diff)
download	llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.gz llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.bz2 llvm-3bde6fe0df05558b89e7edfe48ac05da59beb81a.tar.xz