Merging r167948, r168198: into the 3.2 release branch

r168198 [NVPTX] Order global variables in def-use order before emiting them in the final assembly r167948 [NVPTX] Implement custom lowering of loads/stores for i1 Loads from i1 become loads from i8 followed by trunc Stores to i1 become zext to i8 followed by store to i8 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_32@168335 91177308-0d34-0410-b5e6-96231b3b80d8
author: Pawel Wodnicki <pawel@32bitmicro.com> 2012-11-19 22:25:44 +0000
committer: Pawel Wodnicki <pawel@32bitmicro.com> 2012-11-19 22:25:44 +0000
commit: c8a344e16acf6df8d402729da2e1dff7c5187633 (patch)
tree: c5cfb4f0ece5077d4c0629e216e2e7388e2104ba
parent: 97b07299fad0d019224912afe63fa916c4a0c507 (diff)
download: llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.gz
llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.bz2
llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.xz
5 files changed, 174 insertions, 5 deletions
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 3dd9bf5613..0a885ce1c4 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -68,7 +68,54 @@ static cl::opt<bool, true>InterleaveSrc("nvptx-emit-src",
                                         cl::location(llvm::InterleaveSrcInPtx));
 
 
+namespace {
+/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
+/// depends.
+void DiscoverDependentGlobals(Value *V,
+                              DenseSet<GlobalVariable*> &Globals) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    Globals.insert(GV);
+  else {
+    if (User *U = dyn_cast<User>(V)) {
+      for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
+        DiscoverDependentGlobals(U->getOperand(i), Globals);
+      }
+    }
+  }
+}
 
+/// VisitGlobalVariableForEmission - Add \p GV to the list of GlobalVariable
+/// instances to be emitted, but only after any dependents have been added
+/// first.
+void VisitGlobalVariableForEmission(GlobalVariable *GV,
+                                    SmallVectorImpl<GlobalVariable*> &Order,
+                                    DenseSet<GlobalVariable*> &Visited,
+                                    DenseSet<GlobalVariable*> &Visiting) {
+  // Have we already visited this one?
+  if (Visited.count(GV)) return;
+
+  // Do we have a circular dependency?
+  if (Visiting.count(GV))
+    report_fatal_error("Circular dependency found in global variable set");
+
+  // Start visiting this global
+  Visiting.insert(GV);
+
+  // Make sure we visit all dependents first
+  DenseSet<GlobalVariable*> Others;
+  for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
+    DiscoverDependentGlobals(GV->getOperand(i), Others);
+  
+  for (DenseSet<GlobalVariable*>::iterator I = Others.begin(),
+       E = Others.end(); I != E; ++I)
+    VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
+
+  // Now we can visit ourself
+  Order.push_back(GV);
+  Visited.insert(GV);
+  Visiting.erase(GV);
+}
+}
 
 // @TODO: This is a copy from AsmPrinter.cpp.  The function is static, so we
 // cannot just link to the existing version.
@@ -893,10 +940,27 @@ bool NVPTXAsmPrinter::doInitialization (Module &M) {
 
   emitDeclarations(M, OS2);
 
-  // Print out module-level global variables here.
+  // As ptxas does not support forward references of globals, we need to first
+  // sort the list of module-level globals in def-use order. We visit each
+  // global variable in order, and ensure that we emit it *after* its dependent
+  // globals. We use a little extra memory maintaining both a set and a list to
+  // have fast searches while maintaining a strict ordering.
+  SmallVector<GlobalVariable*,8> Globals;
+  DenseSet<GlobalVariable*> GVVisited;
+  DenseSet<GlobalVariable*> GVVisiting;
+
+  // Visit each global variable, in order
   for (Module::global_iterator I = M.global_begin(), E = M.global_end();
-      I != E; ++I)
-    printModuleLevelGV(I, OS2);
+       I != E; ++I)
+    VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
+
+  assert(GVVisited.size() == M.getGlobalList().size() && 
+         "Missed a global variable");
+  assert(GVVisiting.size() == 0 && "Did not fully process a global variable");
+
+  // Print out module-level global variables in proper order
+  for (unsigned i = 0, e = Globals.size(); i != e; ++i)
+    printModuleLevelGV(Globals[i], OS2);
 
   OS2 << '\n';
 
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index be771e3567..8430e21204 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -174,10 +174,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
 
   // PTX does not support load / store predicate registers
-  setOperationAction(ISD::LOAD, MVT::i1, Expand);
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+  setOperationAction(ISD::STORE, MVT::i1, Custom);
+
   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
   setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setOperationAction(ISD::STORE, MVT::i1, Expand);
   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
   setTruncStoreAction(MVT::i32, MVT::i1, Expand);
   setTruncStoreAction(MVT::i16, MVT::i1, Expand);
@@ -856,11 +857,66 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::EXTRACT_SUBVECTOR:
     return Op;
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::LOAD: return LowerLOAD(Op, DAG);
   default:
     llvm_unreachable("Custom lowering not defined for operation");
   }
 }
 
+
+// v = ld i1* addr
+//   =>
+// v1 = ld i8* addr
+// v = trunc v1 to i1
+SDValue NVPTXTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  LoadSDNode *LD = cast<LoadSDNode>(Node);
+  DebugLoc dl = Node->getDebugLoc();
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  assert(ExtType == ISD::NON_EXTLOAD) ;
+  EVT VT = Node->getValueType(0);
+  assert(VT == MVT::i1 && "Custom lowering for i1 load only");
+  SDValue newLD = DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(),
+                              LD->getPointerInfo(),
+                              LD->isVolatile(), LD->isNonTemporal(),
+                              LD->isInvariant(),
+                              LD->getAlignment());
+  SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
+  // The legalizer (the caller) is expecting two values from the legalized
+  // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
+  // in LegalizeDAG.cpp which also uses MergeValues.
+  SDValue Ops[] = {result, LD->getChain()};
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+// st i1 v, addr
+//    =>
+// v1 = zxt v to i8
+// st i8, addr
+SDValue NVPTXTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  StoreSDNode *ST = cast<StoreSDNode>(Node);
+  SDValue Tmp1 = ST->getChain();
+  SDValue Tmp2 = ST->getBasePtr();
+  SDValue Tmp3 = ST->getValue();
+  EVT VT = Tmp3.getValueType();
+  assert(VT == MVT::i1 && "Custom lowering for i1 store only");
+  unsigned Alignment = ST->getAlignment();
+  bool isVolatile = ST->isVolatile();
+  bool isNonTemporal = ST->isNonTemporal();
+  Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl,
+                     MVT::i8, Tmp3);
+  SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2,
+                                ST->getPointerInfo(), isVolatile,
+                                isNonTemporal, Alignment);
+  return Result;
+}
+
+
 SDValue
 NVPTXTargetLowering::getExtSymb(SelectionDAG &DAG, const char *inname, int idx,
                                 EVT v) const {
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 86246e6449..94a177ceb0 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -138,6 +138,9 @@ private:
   SDValue getParamHelpSymbol(SelectionDAG &DAG, int idx);
 
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
 };
 } // namespace llvm
 
diff --git a/test/CodeGen/NVPTX/global-ordering.ll b/test/CodeGen/NVPTX/global-ordering.ll
new file mode 100644
index 0000000000..43394a79e9
--- /dev/null
+++ b/test/CodeGen/NVPTX/global-ordering.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; Make sure we emit these globals in def-use order
+
+
+; PTX32:      .visible .global .align 1 .u8 a = 2;
+; PTX32-NEXT: .visible .global .align 4 .u32 a2 = a;
+; PTX64:      .visible .global .align 1 .u8 a = 2;
+; PTX64-NEXT: .visible .global .align 8 .u64 a2 = a;
+@a2 = addrspace(1) global i8 addrspace(1)* @a
+@a = addrspace(1) global i8 2
+
+
+; PTX32:      .visible .global .align 1 .u8 b = 1;
+; PTX32-NEXT: .visible .global .align 4 .u32 b2[2] = {b, b};
+; PTX64:      .visible .global .align 1 .u8 b = 1;
+; PTX64-NEXT: .visible .global .align 8 .u64 b2[2] = {b, b};
+@b2 = addrspace(1) global [2 x i8 addrspace(1)*] [i8 addrspace(1)* @b, i8 addrspace(1)* @b]
+@b = addrspace(1) global i8 1
diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll
new file mode 100644
index 0000000000..779f7798d8
--- /dev/null
+++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+define ptx_kernel void @t1(i1* %a) {
+; PTX32:      mov.u16 %rc{{[0-9]+}}, 0;
+; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}};
+; PTX64:      mov.u16 %rc{{[0-9]+}}, 0;
+; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}};
+  store i1 false, i1* %a
+  ret void
+}
+
+
+define ptx_kernel void @t2(i1* %a, i8* %b) {
+; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}]
+; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}]
+; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1;
+; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1;
+
+  %t1 = load i1* %a
+  %t2 = select i1 %t1, i8 1, i8 2
+  store i8 %t2, i8* %b
+  ret void
+}
author	Pawel Wodnicki <pawel@32bitmicro.com>	2012-11-19 22:25:44 +0000
committer	Pawel Wodnicki <pawel@32bitmicro.com>	2012-11-19 22:25:44 +0000
commit	c8a344e16acf6df8d402729da2e1dff7c5187633 (patch)
tree	c5cfb4f0ece5077d4c0629e216e2e7388e2104ba
parent	97b07299fad0d019224912afe63fa916c4a0c507 (diff)
download	llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.gz llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.bz2 llvm-c8a344e16acf6df8d402729da2e1dff7c5187633.tar.xz