summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJustin Holewinski <jholewinski@nvidia.com>2013-05-31 12:14:49 +0000
committerJustin Holewinski <jholewinski@nvidia.com>2013-05-31 12:14:49 +0000
commit5443e7d79044f3198f2da044f1b389b40d9bea6f (patch)
tree2af410aedc2eb4f9a78443029eb71da208562589
parent7ae921dbec5db9823c89fa736b2a4c3afe163e4f (diff)
downloadllvm-5443e7d79044f3198f2da044f1b389b40d9bea6f.tar.gz
llvm-5443e7d79044f3198f2da044f1b389b40d9bea6f.tar.bz2
llvm-5443e7d79044f3198f2da044f1b389b40d9bea6f.tar.xz
[NVPTX] Re-enable support for virtual registers in the final output
Now that 3.3 is branched, we are re-enabling virtual registers to help iron out bugs before the next release. Some of the post-RA passes do not play well with virtual registers, so we disable them for now. The needed functionality of the PrologEpilogInserter pass is copied to a new backend-specific NVPTXPrologEpilog pass. The test for this commit is not breaking the existing tests. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182998 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/NVPTX/CMakeLists.txt1
-rw-r--r--lib/Target/NVPTX/NVPTX.h2
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.cpp63
-rw-r--r--lib/Target/NVPTX/NVPTXAsmPrinter.h5
-rw-r--r--lib/Target/NVPTX/NVPTXInstrInfo.cpp46
-rw-r--r--lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp225
-rw-r--r--lib/Target/NVPTX/NVPTXRegisterInfo.cpp4
-rw-r--r--lib/Target/NVPTX/NVPTXTargetMachine.cpp27
-rw-r--r--test/CodeGen/NVPTX/intrinsic-old.ll66
-rw-r--r--test/CodeGen/NVPTX/intrinsics.ll4
10 files changed, 342 insertions, 101 deletions
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 735ca9b6b7..a8293da760 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -24,6 +24,7 @@ set(NVPTXCodeGen_sources
NVPTXUtilities.cpp
NVVMReflect.cpp
NVPTXGenericToNVVM.cpp
+ NVPTXPrologEpilogPass.cpp
)
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 072c65da35..179dc277f4 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -27,6 +27,7 @@
namespace llvm {
class NVPTXTargetMachine;
class FunctionPass;
+class MachineFunctionPass;
class formatted_raw_ostream;
namespace NVPTXCC {
@@ -66,6 +67,7 @@ FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
ModulePass *createGenericToNVVMPass();
ModulePass *createNVVMReflectPass();
ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
+MachineFunctionPass *createNVPTXPrologEpilogPass();
bool isImageOrSamplerVal(const Value *, const Module *);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 6cc52bda98..44f357d084 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -436,9 +436,7 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
}
void NVPTXAsmPrinter::EmitFunctionBodyStart() {
- const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
- unsigned numRegClasses = TRI.getNumRegClasses();
- VRidGlobal2LocalMap = new std::map<unsigned, unsigned>[numRegClasses + 1];
+ VRegMapping.clear();
OutStreamer.EmitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
@@ -450,7 +448,7 @@ void NVPTXAsmPrinter::EmitFunctionBodyStart() {
void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
OutStreamer.EmitRawText(StringRef("}\n"));
- delete[] VRidGlobal2LocalMap;
+ VRegMapping.clear();
}
void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
@@ -507,9 +505,8 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F,
void NVPTXAsmPrinter::getVirtualRegisterName(unsigned vr, bool isVec,
raw_ostream &O) {
const TargetRegisterClass *RC = MRI->getRegClass(vr);
- unsigned id = RC->getID();
- std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[id];
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
unsigned mapped_vr = regmap[vr];
if (!isVec) {
@@ -1709,48 +1706,36 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
for (unsigned i = 0; i < numVRs; i++) {
unsigned int vr = TRI->index2VirtReg(i);
const TargetRegisterClass *RC = MRI->getRegClass(vr);
- std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[RC->getID()];
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
int n = regmap.size();
regmap.insert(std::make_pair(vr, n + 1));
}
// Emit register declarations
// @TODO: Extract out the real register usage
- O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
- O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .pred %p<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rc<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s16 %rs<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s32 %r<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .s64 %rl<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f32 %f<" << NVPTXNumRegisters << ">;\n";
+ // O << "\t.reg .f64 %fl<" << NVPTXNumRegisters << ">;\n";
// Emit declaration of the virtual registers or 'physical' registers for
// each register class
- //for (unsigned i=0; i< numRegClasses; i++) {
- // std::map<unsigned, unsigned> &regmap = VRidGlobal2LocalMap[i];
- // const TargetRegisterClass *RC = TRI->getRegClass(i);
- // std::string rcname = getNVPTXRegClassName(RC);
- // std::string rcStr = getNVPTXRegClassStr(RC);
- // //int n = regmap.size();
- // if (!isNVPTXVectorRegClass(RC)) {
- // O << "\t.reg " << rcname << " \t" << rcStr << "<"
- // << NVPTXNumRegisters << ">;\n";
- // }
-
- // Only declare those registers that may be used. And do not emit vector
- // registers as
- // they are all elementized to scalar registers.
- //if (n && !isNVPTXVectorRegClass(RC)) {
- // if (RegAllocNilUsed) {
- // O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
- // << ">;\n";
- // }
- // else {
- // O << "\t.reg " << rcname << " \t" << StrToUpper(rcStr)
- // << "<" << 32 << ">;\n";
- // }
- //}
- //}
+ for (unsigned i=0; i< TRI->getNumRegClasses(); i++) {
+ const TargetRegisterClass *RC = TRI->getRegClass(i);
+ DenseMap<unsigned, unsigned> &regmap = VRegMapping[RC];
+ std::string rcname = getNVPTXRegClassName(RC);
+ std::string rcStr = getNVPTXRegClassStr(RC);
+ int n = regmap.size();
+
+ // Only declare those registers that may be used.
+ if (n) {
+ O << "\t.reg " << rcname << " \t" << rcStr << "<" << (n+1)
+ << ">;\n";
+ }
+ }
OutStreamer.EmitRawText(O.str());
}
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 7faa6b265b..55f2943667 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -243,7 +243,9 @@ private:
// The contents are specific for each
// MachineFunction. But the size of the
// array is not.
- std::map<unsigned, unsigned> *VRidGlobal2LocalMap;
+ typedef DenseMap<unsigned, unsigned> VRegMap;
+ typedef DenseMap<const TargetRegisterClass *, VRegMap> VRegRCMap;
+ VRegRCMap VRegMapping;
// cache the subtarget here.
const NVPTXSubtarget &nvptxSubtarget;
// Build the map between type name and ID based on module's type
@@ -281,7 +283,6 @@ public:
: AsmPrinter(TM, Streamer),
nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
CurrentBankselLabelInBasicBlock = "";
- VRidGlobal2LocalMap = NULL;
reader = NULL;
}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 33a63c26f4..52be28736b 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -32,36 +32,36 @@ NVPTXInstrInfo::NVPTXInstrInfo(NVPTXTargetMachine &tm)
void NVPTXInstrInfo::copyPhysReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
unsigned DestReg, unsigned SrcReg, bool KillSrc) const {
- if (NVPTX::Int32RegsRegClass.contains(DestReg) &&
- NVPTX::Int32RegsRegClass.contains(SrcReg))
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ const TargetRegisterClass *DestRC = MRI.getRegClass(DestReg);
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+
+ if (DestRC != SrcRC)
+ report_fatal_error("Attempted to created cross-class register copy");
+
+ if (DestRC == &NVPTX::Int32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int8RegsRegClass.contains(DestReg) &&
- NVPTX::Int8RegsRegClass.contains(SrcReg))
- BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int1RegsRegClass.contains(DestReg) &&
- NVPTX::Int1RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int1RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV1rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Float32RegsRegClass.contains(DestReg) &&
- NVPTX::Float32RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float32RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV32rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int16RegsRegClass.contains(DestReg) &&
- NVPTX::Int16RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int16RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Int64RegsRegClass.contains(DestReg) &&
- NVPTX::Int64RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int8RegsRegClass)
+ BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Int64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
- else if (NVPTX::Float64RegsRegClass.contains(DestReg) &&
- NVPTX::Float64RegsRegClass.contains(SrcReg))
+ .addReg(SrcReg, getKillRegState(KillSrc));
+ else if (DestRC == &NVPTX::Float64RegsRegClass)
BuildMI(MBB, I, DL, get(NVPTX::FMOV64rr), DestReg)
- .addReg(SrcReg, getKillRegState(KillSrc));
+ .addReg(SrcReg, getKillRegState(KillSrc));
else {
- llvm_unreachable("Don't know how to copy a register");
+ llvm_unreachable("Bad register copy");
}
}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
new file mode 100644
index 0000000000..843ebed5e4
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -0,0 +1,225 @@
+//===-- NVPTXPrologEpilogPass.cpp - NVPTX prolog/epilog inserter ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a copy of the generic LLVM PrologEpilogInserter pass, modified
+// to remove unneeded functionality and to handle virtual registers. Most code
+// here is a copy of PrologEpilogInserter.cpp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/Pass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXPrologEpilogPass : public MachineFunctionPass {
+public:
+ static char ID;
+ NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+private:
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+};
+}
+
+MachineFunctionPass *llvm::createNVPTXPrologEpilogPass() {
+ return new NVPTXPrologEpilogPass();
+}
+
+char NVPTXPrologEpilogPass::ID = 0;
+
+bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameLowering &TFI = *TM.getFrameLowering();
+ const TargetRegisterInfo &TRI = *TM.getRegisterInfo();
+ bool Modified = false;
+
+ calculateFrameObjectOffsets(MF);
+
+ for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB) {
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I) {
+ MachineInstr *MI = I;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ if (!MI->getOperand(i).isFI())
+ continue;
+ TRI.eliminateFrameIndex(MI, 0, i, NULL);
+ Modified = true;
+ }
+ }
+ }
+
+ // Add function prolog/epilog
+ TFI.emitPrologue(MF);
+
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ // If last instruction is a return instruction, add an epilogue
+ if (!I->empty() && I->back().isReturn())
+ TFI.emitEpilogue(MF, *I);
+ }
+
+ return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ if (StackGrowsDown) {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, Offset);
+ Offset += MFI->getObjectSize(FrameIdx);
+ }
+}
+
+void
+NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getTarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = Fn.getTarget().getRegisterInfo();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // If there are fixed sized objects that are preallocated in the local area,
+ // non-fixed objects can't be allocated right at the start of local area.
+ // We currently don't support filling in holes in between fixed sized
+ // objects, so we adjust 'Offset' to point to the end of last fixed sized
+ // preallocated object.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+ // The maximum distance from the stack pointer is at lower address of
+ // the object -- which is given by offset. For down growing stack
+ // the offset is negative, so we negate the offset to get the distance.
+ FixedOff = -MFI->getObjectOffset(i);
+ } else {
+ // The maximum distance from the start pointer is at the upper
+ // address of the object.
+ FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // NOTE: We do not have a call stack
+
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+ // No scavenger
+
+ // FIXME: Once this is working, then enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI->getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI->getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI->setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI->getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // No stack protector
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+ }
+
+ // No scavenger
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+ // If we have reserved argument space for call sites in the function
+ // immediately on entry to the current function, count it as part of the
+ // overall stack size.
+ if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI->getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+}
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 282465359b..bb039f8397 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -57,9 +57,9 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
return "%f";
}
if (RC == &NVPTX::Float64RegsRegClass) {
- return "%fd";
+ return "%fl";
} else if (RC == &NVPTX::Int64RegsRegClass) {
- return "%rd";
+ return "%rl";
} else if (RC == &NVPTX::Int32RegsRegClass) {
return "%r";
} else if (RC == &NVPTX::Int16RegsRegClass) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 5b8ea1ed99..68f9bf74f7 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -107,6 +107,10 @@ public:
virtual void addIRPasses();
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
+ virtual bool addPostRegAlloc();
+
+ virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
+ virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
};
} // end anonymous namespace
@@ -116,6 +120,15 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
}
void NVPTXPassConfig::addIRPasses() {
+ // The following passes are known to not play well with virtual regs hanging
+ // around after register allocation (which in our case, is *all* registers).
+ // We explicitly disable them here. We do, however, need some functionality
+ // of the PrologEpilogCodeInserter pass, so we emulate that behavior in the
+ // NVPTXPrologEpilog pass (see NVPTXPrologEpilogPass.cpp).
+ disablePass(&PrologEpilogCodeInserterID);
+ disablePass(&MachineCopyPropagationID);
+ disablePass(&BranchFolderPassID);
+
TargetPassConfig::addIRPasses();
addPass(createGenericToNVVMPass());
}
@@ -129,3 +142,17 @@ bool NVPTXPassConfig::addInstSelector() {
}
bool NVPTXPassConfig::addPreRegAlloc() { return false; }
+bool NVPTXPassConfig::addPostRegAlloc() {
+ addPass(createNVPTXPrologEpilogPass());
+ return false;
+}
+
+void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ // No reg alloc
+ addPass(&StrongPHIEliminationID);
+}
+
+void NVPTXPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ // No reg alloc
+ addPass(&StrongPHIEliminationID);
+}
diff --git a/test/CodeGen/NVPTX/intrinsic-old.ll b/test/CodeGen/NVPTX/intrinsic-old.ll
index 53a28f3337..af91bb4424 100644
--- a/test/CodeGen/NVPTX/intrinsic-old.ll
+++ b/test/CodeGen/NVPTX/intrinsic-old.ll
@@ -2,231 +2,231 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
define ptx_device i32 @test_tid_x() {
-; CHECK: mov.u32 %r0, %tid.x;
+; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.x()
ret i32 %x
}
define ptx_device i32 @test_tid_y() {
-; CHECK: mov.u32 %r0, %tid.y;
+; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.y()
ret i32 %x
}
define ptx_device i32 @test_tid_z() {
-; CHECK: mov.u32 %r0, %tid.z;
+; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.z()
ret i32 %x
}
define ptx_device i32 @test_tid_w() {
-; CHECK: mov.u32 %r0, %tid.w;
+; CHECK: mov.u32 %r{{[0-9]+}}, %tid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.tid.w()
ret i32 %x
}
define ptx_device i32 @test_ntid_x() {
-; CHECK: mov.u32 %r0, %ntid.x;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.x()
ret i32 %x
}
define ptx_device i32 @test_ntid_y() {
-; CHECK: mov.u32 %r0, %ntid.y;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.y()
ret i32 %x
}
define ptx_device i32 @test_ntid_z() {
-; CHECK: mov.u32 %r0, %ntid.z;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.z()
ret i32 %x
}
define ptx_device i32 @test_ntid_w() {
-; CHECK: mov.u32 %r0, %ntid.w;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ntid.w()
ret i32 %x
}
define ptx_device i32 @test_laneid() {
-; CHECK: mov.u32 %r0, %laneid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %laneid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.laneid()
ret i32 %x
}
define ptx_device i32 @test_warpid() {
-; CHECK: mov.u32 %r0, %warpid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %warpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.warpid()
ret i32 %x
}
define ptx_device i32 @test_nwarpid() {
-; CHECK: mov.u32 %r0, %nwarpid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nwarpid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nwarpid()
ret i32 %x
}
define ptx_device i32 @test_ctaid_x() {
-; CHECK: mov.u32 %r0, %ctaid.x;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.x()
ret i32 %x
}
define ptx_device i32 @test_ctaid_y() {
-; CHECK: mov.u32 %r0, %ctaid.y;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.y()
ret i32 %x
}
define ptx_device i32 @test_ctaid_z() {
-; CHECK: mov.u32 %r0, %ctaid.z;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.z()
ret i32 %x
}
define ptx_device i32 @test_ctaid_w() {
-; CHECK: mov.u32 %r0, %ctaid.w;
+; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.ctaid.w()
ret i32 %x
}
define ptx_device i32 @test_nctaid_x() {
-; CHECK: mov.u32 %r0, %nctaid.x;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.x()
ret i32 %x
}
define ptx_device i32 @test_nctaid_y() {
-; CHECK: mov.u32 %r0, %nctaid.y;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.y()
ret i32 %x
}
define ptx_device i32 @test_nctaid_z() {
-; CHECK: mov.u32 %r0, %nctaid.z;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.z()
ret i32 %x
}
define ptx_device i32 @test_nctaid_w() {
-; CHECK: mov.u32 %r0, %nctaid.w;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.w;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nctaid.w()
ret i32 %x
}
define ptx_device i32 @test_smid() {
-; CHECK: mov.u32 %r0, %smid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %smid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.smid()
ret i32 %x
}
define ptx_device i32 @test_nsmid() {
-; CHECK: mov.u32 %r0, %nsmid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %nsmid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.nsmid()
ret i32 %x
}
define ptx_device i32 @test_gridid() {
-; CHECK: mov.u32 %r0, %gridid;
+; CHECK: mov.u32 %r{{[0-9]+}}, %gridid;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.gridid()
ret i32 %x
}
define ptx_device i32 @test_lanemask_eq() {
-; CHECK: mov.u32 %r0, %lanemask_eq;
+; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_eq;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.eq()
ret i32 %x
}
define ptx_device i32 @test_lanemask_le() {
-; CHECK: mov.u32 %r0, %lanemask_le;
+; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_le;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.le()
ret i32 %x
}
define ptx_device i32 @test_lanemask_lt() {
-; CHECK: mov.u32 %r0, %lanemask_lt;
+; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_lt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.lt()
ret i32 %x
}
define ptx_device i32 @test_lanemask_ge() {
-; CHECK: mov.u32 %r0, %lanemask_ge;
+; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_ge;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.ge()
ret i32 %x
}
define ptx_device i32 @test_lanemask_gt() {
-; CHECK: mov.u32 %r0, %lanemask_gt;
+; CHECK: mov.u32 %r{{[0-9]+}}, %lanemask_gt;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.lanemask.gt()
ret i32 %x
}
define ptx_device i32 @test_clock() {
-; CHECK: mov.u32 %r0, %clock;
+; CHECK: mov.u32 %r{{[0-9]+}}, %clock;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.clock()
ret i32 %x
}
define ptx_device i64 @test_clock64() {
-; CHECK: mov.u64 %rl0, %clock64;
+; CHECK: mov.u64 %rl{{[0-9]+}}, %clock64;
; CHECK: ret;
%x = call i64 @llvm.ptx.read.clock64()
ret i64 %x
}
define ptx_device i32 @test_pm0() {
-; CHECK: mov.u32 %r0, %pm0;
+; CHECK: mov.u32 %r{{[0-9]+}}, %pm0;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm0()
ret i32 %x
}
define ptx_device i32 @test_pm1() {
-; CHECK: mov.u32 %r0, %pm1;
+; CHECK: mov.u32 %r{{[0-9]+}}, %pm1;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm1()
ret i32 %x
}
define ptx_device i32 @test_pm2() {
-; CHECK: mov.u32 %r0, %pm2;
+; CHECK: mov.u32 %r{{[0-9]+}}, %pm2;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm2()
ret i32 %x
}
define ptx_device i32 @test_pm3() {
-; CHECK: mov.u32 %r0, %pm3;
+; CHECK: mov.u32 %r{{[0-9]+}}, %pm3;
; CHECK: ret;
%x = call i32 @llvm.ptx.read.pm3()
ret i32 %x
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 1676f20643..78e1e77890 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -2,14 +2,14 @@
; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
define ptx_device float @test_fabsf(float %f) {
-; CHECK: abs.f32 %f0, %f0;
+; CHECK: abs.f32 %f{{[0-9]+}}, %f{{[0-9]+}};
; CHECK: ret;
%x = call float @llvm.fabs.f32(float %f)
ret float %x
}
define ptx_device double @test_fabs(double %d) {
-; CHECK: abs.f64 %fl0, %fl0;
+; CHECK: abs.f64 %fl{{[0-9]+}}, %fl{{[0-9]+}};
; CHECK: ret;
%x = call double @llvm.fabs.f64(double %d)
ret double %x