diff options
-rw-r--r-- | include/llvm/IR/IntrinsicsNVVM.td | 2 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 3 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 413 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 5 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelLowering.cpp | 995 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXISelLowering.h | 30 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXInstrInfo.cpp | 3 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXInstrInfo.td | 609 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXIntrinsics.td | 36 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXRegisterInfo.cpp | 6 | ||||
-rw-r--r-- | lib/Target/NVPTX/NVPTXRegisterInfo.td | 2 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/compare-int.ll | 40 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/ld-addrspace.ll | 12 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/ld-generic.ll | 4 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/pr13291-i1-store.ll | 16 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/st-addrspace.ll | 12 | ||||
-rw-r--r-- | test/CodeGen/NVPTX/st-generic.ll | 4 |
17 files changed, 1287 insertions, 905 deletions
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index c248517def..a372c22e43 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -861,8 +861,6 @@ def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], // Move intrinsics, used in nvvm internally -def int_nvvm_move_i8 : Intrinsic<[llvm_i8_ty], [llvm_i8_ty], [IntrNoMem], - "llvm.nvvm.move.i8">; def int_nvvm_move_i16 : Intrinsic<[llvm_i16_ty], [llvm_i16_ty], [IntrNoMem], "llvm.nvvm.move.i16">; def int_nvvm_move_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem], diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 84b088400c..9188262ca9 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -2016,7 +2016,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { case NVPTX::CallArgI32: case NVPTX::CallArgI32imm: case NVPTX::CallArgI64: - case NVPTX::CallArgI8: case NVPTX::CallArgParam: case NVPTX::CallVoidInst: case NVPTX::CallVoidInstReg: @@ -2050,7 +2049,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { case NVPTX::LastCallArgI32: case NVPTX::LastCallArgI32imm: case NVPTX::LastCallArgI64: - case NVPTX::LastCallArgI8: case NVPTX::LastCallArgParam: case NVPTX::LoadParamMemF32: case NVPTX::LoadParamMemF64: @@ -2063,7 +2061,6 @@ bool NVPTXAsmPrinter::ignoreLoc(const MachineInstr &MI) { case NVPTX::LoadParamRegI16: case NVPTX::LoadParamRegI32: case NVPTX::LoadParamRegI64: - case NVPTX::LoadParamRegI8: case NVPTX::PrototypeInst: case NVPTX::DBG_VALUE: return true; diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index ac6dbb9eb6..7a0a59f1ce 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -116,6 +116,23 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) { case NVPTXISD::StoreV4: ResNode = SelectStoreVector(N); break; + case NVPTXISD::LoadParam: + case NVPTXISD::LoadParamV2: + case NVPTXISD::LoadParamV4: + ResNode = SelectLoadParam(N); + break; + case NVPTXISD::StoreRetval: + case NVPTXISD::StoreRetvalV2: + case NVPTXISD::StoreRetvalV4: + ResNode = SelectStoreRetval(N); + break; + case NVPTXISD::StoreParam: + case NVPTXISD::StoreParamV2: + case NVPTXISD::StoreParamV4: + case NVPTXISD::StoreParamS32: + case NVPTXISD::StoreParamU32: + ResNode = SelectStoreParam(N); + break; default: break; } @@ -771,7 +788,9 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) { SDLoc DL(N); SDNode *LD; - EVT RetVT = N->getValueType(0); + MemSDNode *Mem = cast<MemSDNode>(N); + + EVT RetVT = Mem->getMemoryVT().getVectorElementType(); // Select opcode if (Subtarget.is64Bit()) { @@ -1571,6 +1590,398 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) { return ST; } +SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) { + SDValue Chain = Node->getOperand(0); + SDValue Offset = Node->getOperand(2); + SDValue Flag = Node->getOperand(3); + SDLoc DL(Node); + MemSDNode *Mem = cast<MemSDNode>(Node); + + unsigned VecSize; + switch (Node->getOpcode()) { + default: + return NULL; + case NVPTXISD::LoadParam: + VecSize = 1; + break; + case NVPTXISD::LoadParamV2: + VecSize = 2; + break; + case NVPTXISD::LoadParamV4: + VecSize = 4; + break; + } + + EVT EltVT = Node->getValueType(0); + EVT MemVT = Mem->getMemoryVT(); + + unsigned Opc = 0; + + switch (VecSize) { + default: + return NULL; + case 1: + switch (MemVT.getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opc = NVPTX::LoadParamMemI8; + break; + case MVT::i8: + Opc = NVPTX::LoadParamMemI8; + break; + case MVT::i16: + Opc = NVPTX::LoadParamMemI16; + break; + case MVT::i32: + Opc = NVPTX::LoadParamMemI32; + break; + case MVT::i64: + Opc = NVPTX::LoadParamMemI64; + break; + case MVT::f32: + Opc = NVPTX::LoadParamMemF32; + break; + case MVT::f64: + Opc = NVPTX::LoadParamMemF64; + break; + } + break; + case 2: + switch (MemVT.getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opc = NVPTX::LoadParamMemV2I8; + break; + case MVT::i8: + Opc = NVPTX::LoadParamMemV2I8; + break; + case MVT::i16: + Opc = NVPTX::LoadParamMemV2I16; + break; + case MVT::i32: + Opc = NVPTX::LoadParamMemV2I32; + break; + case MVT::i64: + Opc = NVPTX::LoadParamMemV2I64; + break; + case MVT::f32: + Opc = NVPTX::LoadParamMemV2F32; + break; + case MVT::f64: + Opc = NVPTX::LoadParamMemV2F64; + break; + } + break; + case 4: + switch (MemVT.getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opc = NVPTX::LoadParamMemV4I8; + break; + case MVT::i8: + Opc = NVPTX::LoadParamMemV4I8; + break; + case MVT::i16: + Opc = NVPTX::LoadParamMemV4I16; + break; + case MVT::i32: + Opc = NVPTX::LoadParamMemV4I32; + break; + case MVT::f32: + Opc = NVPTX::LoadParamMemV4F32; + break; + } + break; + } + + SDVTList VTs; + if (VecSize == 1) { + VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue); + } else if (VecSize == 2) { + VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue); + } else { + EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue }; + VTs = CurDAG->getVTList(&EVTs[0], 5); + } + + unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + + SmallVector<SDValue, 2> Ops; + Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32)); + Ops.push_back(Chain); + Ops.push_back(Flag); + + SDNode *Ret = + CurDAG->getMachineNode(Opc, DL, Node->getVTList(), Ops); + return Ret; +} + +SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Offset = N->getOperand(1); + unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + MemSDNode *Mem = cast<MemSDNode>(N); + + // How many elements do we have? + unsigned NumElts = 1; + switch (N->getOpcode()) { + default: + return NULL; + case NVPTXISD::StoreRetval: + NumElts = 1; + break; + case NVPTXISD::StoreRetvalV2: + NumElts = 2; + break; + case NVPTXISD::StoreRetvalV4: + NumElts = 4; + break; + } + + // Build vector of operands + SmallVector<SDValue, 6> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(N->getOperand(i + 2)); + Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32)); + Ops.push_back(Chain); + + // Determine target opcode + // If we have an i1, use an 8-bit store. The lowering code in + // NVPTXISelLowering will have already emitted an upcast. + unsigned Opcode = 0; + switch (NumElts) { + default: + return NULL; + case 1: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreRetvalI8; + break; + case MVT::i8: + Opcode = NVPTX::StoreRetvalI8; + break; + case MVT::i16: + Opcode = NVPTX::StoreRetvalI16; + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalI64; + break; + case MVT::f32: + Opcode = NVPTX::StoreRetvalF32; + break; + case MVT::f64: + Opcode = NVPTX::StoreRetvalF64; + break; + } + break; + case 2: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreRetvalV2I8; + break; + case MVT::i8: + Opcode = NVPTX::StoreRetvalV2I8; + break; + case MVT::i16: + Opcode = NVPTX::StoreRetvalV2I16; + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalV2I32; + break; + case MVT::i64: + Opcode = NVPTX::StoreRetvalV2I64; + break; + case MVT::f32: + Opcode = NVPTX::StoreRetvalV2F32; + break; + case MVT::f64: + Opcode = NVPTX::StoreRetvalV2F64; + break; + } + break; + case 4: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreRetvalV4I8; + break; + case MVT::i8: + Opcode = NVPTX::StoreRetvalV4I8; + break; + case MVT::i16: + Opcode = NVPTX::StoreRetvalV4I16; + break; + case MVT::i32: + Opcode = NVPTX::StoreRetvalV4I32; + break; + case MVT::f32: + Opcode = NVPTX::StoreRetvalV4F32; + break; + } + break; + } + + SDNode *Ret = + CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops); + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return Ret; +} + +SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) { + SDLoc DL(N); + SDValue Chain = N->getOperand(0); + SDValue Param = N->getOperand(1); + unsigned ParamVal = cast<ConstantSDNode>(Param)->getZExtValue(); + SDValue Offset = N->getOperand(2); + unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue(); + MemSDNode *Mem = cast<MemSDNode>(N); + SDValue Flag = N->getOperand(N->getNumOperands() - 1); + + // How many elements do we have? + unsigned NumElts = 1; + switch (N->getOpcode()) { + default: + return NULL; + case NVPTXISD::StoreParamU32: + case NVPTXISD::StoreParamS32: + case NVPTXISD::StoreParam: + NumElts = 1; + break; + case NVPTXISD::StoreParamV2: + NumElts = 2; + break; + case NVPTXISD::StoreParamV4: + NumElts = 4; + break; + } + + // Build vector of operands + SmallVector<SDValue, 8> Ops; + for (unsigned i = 0; i < NumElts; ++i) + Ops.push_back(N->getOperand(i + 3)); + Ops.push_back(CurDAG->getTargetConstant(ParamVal, MVT::i32)); + Ops.push_back(CurDAG->getTargetConstant(OffsetVal, MVT::i32)); + Ops.push_back(Chain); + Ops.push_back(Flag); + + // Determine target opcode + // If we have an i1, use an 8-bit store. The lowering code in + // NVPTXISelLowering will have already emitted an upcast. + unsigned Opcode = 0; + switch (N->getOpcode()) { + default: + switch (NumElts) { + default: + return NULL; + case 1: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreParamI8; + break; + case MVT::i8: + Opcode = NVPTX::StoreParamI8; + break; + case MVT::i16: + Opcode = NVPTX::StoreParamI16; + break; + case MVT::i32: + Opcode = NVPTX::StoreParamI32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamI64; + break; + case MVT::f32: + Opcode = NVPTX::StoreParamF32; + break; + case MVT::f64: + Opcode = NVPTX::StoreParamF64; + break; + } + break; + case 2: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreParamV2I8; + break; + case MVT::i8: + Opcode = NVPTX::StoreParamV2I8; + break; + case MVT::i16: + Opcode = NVPTX::StoreParamV2I16; + break; + case MVT::i32: + Opcode = NVPTX::StoreParamV2I32; + break; + case MVT::i64: + Opcode = NVPTX::StoreParamV2I64; + break; + case MVT::f32: + Opcode = NVPTX::StoreParamV2F32; + break; + case MVT::f64: + Opcode = NVPTX::StoreParamV2F64; + break; + } + break; + case 4: + switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) { + default: + return NULL; + case MVT::i1: + Opcode = NVPTX::StoreParamV4I8; + break; + case MVT::i8: + Opcode = NVPTX::StoreParamV4I8; + break; + case MVT::i16: + Opcode = NVPTX::StoreParamV4I16; + break; + case MVT::i32: + Opcode = NVPTX::StoreParamV4I32; + break; + case MVT::f32: + Opcode = NVPTX::StoreParamV4F32; + break; + } + break; + } + break; + case NVPTXISD::StoreParamU32: + Opcode = NVPTX::StoreParamU32I16; + break; + case NVPTXISD::StoreParamS32: + Opcode = NVPTX::StoreParamS32I16; + break; + } + + SDNode *Ret = + CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops); + MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1); + MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand(); + cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1); + + return Ret; +} + // SelectDirectAddr - Match a direct address for DAG. // A direct address could be a globaladdress or externalsymbol. bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) { diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index ed16d4450b..428e7b2288 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -80,7 +80,10 @@ private: SDNode *SelectLDGLDUVector(SDNode *N); SDNode *SelectStore(SDNode *N); SDNode *SelectStoreVector(SDNode *N); - + SDNode *SelectLoadParam(SDNode *N); + SDNode *SelectStoreRetval(SDNode *N); + SDNode *SelectStoreParam(SDNode *N); + inline SDValue getI32Imm(unsigned Imm) { return CurDAG->getTargetConstant(Imm, MVT::i32); } diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 9679b05ab7..0396a6421a 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -51,6 +51,8 @@ static bool IsPTXVectorType(MVT VT) { switch (VT.SimpleTy) { default: return false; + case MVT::v2i1: + case MVT::v4i1: case MVT::v2i8: case MVT::v4i8: case MVT::v2i16: @@ -65,6 +67,37 @@ static bool IsPTXVectorType(MVT VT) { } } +/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive +/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors +/// into their primitive components. +/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the +/// same number of types as the Ins/Outs arrays in LowerFormalArguments, +/// LowerCall, and LowerReturn. +static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty, + SmallVectorImpl<EVT> &ValueVTs, + SmallVectorImpl<uint64_t> *Offsets = 0, + uint64_t StartingOffset = 0) { + SmallVector<EVT, 16> TempVTs; + SmallVector<uint64_t, 16> TempOffsets; + + ComputeValueVTs(TLI, Ty, TempVTs, &TempOffsets, StartingOffset); + for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) { + EVT VT = TempVTs[i]; + uint64_t Off = TempOffsets[i]; + if (VT.isVector()) + for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) { + ValueVTs.push_back(VT.getVectorElementType()); + if (Offsets) + Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize()); + } + else { + ValueVTs.push_back(VT); + if (Offsets) + Offsets->push_back(Off); + } + } +} + // NVPTXTargetLowering Constructor. NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) : TargetLowering(TM, new NVPTXTargetObjectFile()), nvTM(&TM), @@ -90,7 +123,6 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) setSchedulingPreference(Sched::Source); addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass); - addRegisterClass(MVT::i8, &NVPTX::Int8RegsRegClass); addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass); addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass); addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass); @@ -181,6 +213,9 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM) } } + // Custom handling for i8 intrinsics + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + // Now deduce the information based on the above mentioned // actions computeRegisterProperties(); @@ -293,6 +328,7 @@ NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(NVPTXISD::Wrapper, dl, getPointerTy(), Op); } +/* std::string NVPTXTargetLowering::getPrototype( Type *retTy, const ArgListTy &Args, const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment) const { @@ -442,6 +478,152 @@ std::string NVPTXTargetLowering::getPrototype( } O << ");"; return O.str(); +}*/ + +std::string +NVPTXTargetLowering::getPrototype(Type *retTy, const ArgListTy &Args, + const SmallVectorImpl<ISD::OutputArg> &Outs, + unsigned retAlignment, + const ImmutableCallSite *CS) const { + + bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return ""; + + std::stringstream O; + O << "prototype_" << uniqueCallSite << " : .callprototype "; + + if (retTy->getTypeID() == Type::VoidTyID) { + O << "()"; + } else { + O << "("; + if (retTy->isPrimitiveType() || retTy->isIntegerTy()) { + unsigned size = 0; + if (const IntegerType *ITy = dyn_cast<IntegerType>(retTy)) { + size = ITy->getBitWidth(); + if (size < 32) + size = 32; + } else { + assert(retTy->isFloatingPointTy() && + "Floating point type expected here"); + size = retTy->getPrimitiveSizeInBits(); + } + + O << ".param .b" << size << " _"; + } else if (isa<PointerType>(retTy)) { + O << ".param .b" << getPointerTy().getSizeInBits() << " _"; + } else { + if ((retTy->getTypeID() == Type::StructTyID) || isa<VectorType>(retTy)) { + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, retTy, vtparts); + unsigned totalsz = 0; + for (unsigned i = 0, e = vtparts.size(); i != e; ++i) { + unsigned elems = 1; + EVT elemtype = vtparts[i]; + if (vtparts[i].isVector()) { + elems = vtparts[i].getVectorNumElements(); + elemtype = vtparts[i].getVectorElementType(); + } + // TODO: no need to loop + for (unsigned j = 0, je = elems; j != je; ++j) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; + totalsz += sz / 8; + } + } + O << ".param .align " << retAlignment << " .b8 _[" << totalsz << "]"; + } else { + assert(false && "Unknown return type"); + } + } + O << ") "; + } + O << "_ ("; + + bool first = true; + MVT thePointerTy = getPointerTy(); + + unsigned OIdx = 0; + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + Type *Ty = Args[i].Ty; + if (!first) { + O << ", "; + } + first = false; + + if (Outs[OIdx].Flags.isByVal() == false) { + if (Ty->isAggregateType() || Ty->isVectorTy()) { + unsigned align = 0; + const CallInst *CallI = cast<CallInst>(CS->getInstruction()); + const DataLayout *TD = getDataLayout(); + // +1 because index 0 is reserved for return type alignment + if (!llvm::getAlign(*CallI, i + 1, align)) + align = TD->getABITypeAlignment(Ty); + unsigned sz = TD->getTypeAllocSize(Ty); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + // update the index for Outs + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, Ty, vtparts); + if (unsigned len = vtparts.size()) + OIdx += len - 1; + continue; + } + assert(getValueType(Ty) == Outs[OIdx].VT && + "type mismatch between callee prototype and arguments"); + // scalar type + unsigned sz = 0; + if (isa<IntegerType>(Ty)) { + sz = cast<IntegerType>(Ty)->getBitWidth(); + if (sz < 32) + sz = 32; + } else if (isa<PointerType>(Ty)) + sz = thePointerTy.getSizeInBits(); + else + sz = Ty->getPrimitiveSizeInBits(); + O << ".param .b" << sz << " "; + O << "_"; + continue; + } + const PointerType *PTy = dyn_cast<PointerType>(Ty); + assert(PTy && "Param with byval attribute should be a pointer type"); + Type *ETy = PTy->getElementType(); + + unsigned align = Outs[OIdx].Flags.getByValAlign(); + unsigned sz = getDataLayout()->getTypeAllocSize(ETy); + O << ".param .align " << align << " .b8 "; + O << "_"; + O << "[" << sz << "]"; + } + O << ");"; + return O.str(); +} + +unsigned +NVPTXTargetLowering::getArgumentAlignment(SDValue Callee, + const ImmutableCallSite *CS, + Type *Ty, + unsigned Idx) const { + const DataLayout *TD = getDataLayout(); + unsigned align = 0; + GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); + + if (Func) { // direct call + assert(CS->getCalledFunction() && + "direct call cannot find callee"); + if (!llvm::getAlign(*(CS->getCalledFunction()), Idx, align)) + align = TD->getABITypeAlignment(Ty); + } + else { // indirect call + const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction()); + if (!llvm::getAlign(*CallI, Idx, align)) + align = TD->getABITypeAlignment(Ty); + } + + return align; } SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, @@ -459,54 +641,257 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, ImmutableCallSite *CS = CLI.CS; bool isABI = (nvptxSubtarget.getSmVersion() >= 20); + assert(isABI && "Non-ABI compilation is not supported"); + if (!isABI) + return Chain; + const DataLayout *TD = getDataLayout(); + MachineFunction &MF = DAG.getMachineFunction(); + const Function *F = MF.getFunction(); + const TargetLowering *TLI = nvTM->getTargetLowering(); SDValue tempChain = Chain; - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(uniqueCallSite, true), - dl); + Chain = + DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(uniqueCallSite, true), + dl); SDValue InFlag = Chain.getValue(1); - assert((Outs.size() == Args.size()) && - "Unexpected number of arguments to function call"); unsigned paramCount = 0; + // Args.size() and Outs.size() need not match. + // Outs.size() will be larger + // * if there is an aggregate argument with multiple fields (each field + // showing up separately in Outs) + // * if there is a vector argument with more than typical vector-length + // elements (generally if more than 4) where each vector element is + // individually present in Outs. + // So a different index should be used for indexing into Outs/OutVals. + // See similar issue in LowerFormalArguments. + unsigned OIdx = 0; // Declare the .params or .reg need to pass values // to the function - for (unsigned i = 0, e = Outs.size(); i != e; ++i) { - EVT VT = Outs[i].VT; + for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) { + EVT VT = Outs[OIdx].VT; + Type *Ty = Args[i].Ty; - if (Outs[i].Flags.isByVal() == false) { + if (Outs[OIdx].Flags.isByVal() == false) { + if (Ty->isAggregateType()) { + // aggregate + SmallVector<EVT, 16> vtparts; + ComputeValueVTs(*this, Ty, vtparts); + + unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); + // declare .param .align <align> .b8 .param<n>[<size>]; + unsigned sz = TD->getTypeAllocSize(Ty); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32), + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + unsigned curOffset = 0; + for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { + unsigned elems = 1; + EVT elemtype = vtparts[j]; + if (vtparts[j].isVector()) { + elems = vtparts[j].getVectorNumElements(); + elemtype = vtparts[j].getVectorElementType(); + } + for (unsigned k = 0, ke = elems; k != ke; ++k) { + unsigned sz = elemtype.getSizeInBits(); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; + SDValue StVal = OutVals[OIdx]; + if (elemtype.getSizeInBits() < 16) { + StVal = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, StVal); + } + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(curOffset, MVT::i32), + StVal, InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, + CopyParamVTs, &CopyParamOps[0], 5, + elemtype, MachinePointerInfo()); + InFlag = Chain.getValue(1); + curOffset += sz / 8; + ++OIdx; + } + } + if (vtparts.size() > 0) + --OIdx; + ++paramCount; + continue; + } + if (Ty->isVectorTy()) { + EVT ObjectVT = getValueType(Ty); + unsigned align = getArgumentAlignment(Callee, CS, Ty, paramCount + 1); + // declare .param .align <align> .b8 .param<n>[<size>]; + unsigned sz = TD->getTypeAllocSize(Ty); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, MVT::i32), + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(sz, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); + unsigned NumElts = ObjectVT.getVectorNumElements(); + EVT EltVT = ObjectVT.getVectorElementType(); + EVT MemVT = EltVT; + bool NeedExtend = false; + if (EltVT.getSizeInBits() < 16) { + NeedExtend = true; + EltVT = MVT::i16; + } + + // V1 store + if (NumElts == 1) { + SDValue Elt = OutVals[OIdx++]; + if (NeedExtend) + Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt); + + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), Elt, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, + CopyParamVTs, &CopyParamOps[0], 5, + MemVT, MachinePointerInfo()); + InFlag = Chain.getValue(1); + } else if (NumElts == 2) { + SDValue Elt0 = OutVals[OIdx++]; + SDValue Elt1 = OutVals[OIdx++]; + if (NeedExtend) { + Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0); + Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1); + } + + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue CopyParamOps[] = { Chain, + DAG.getConstant(paramCount, MVT::i32), + DAG.getConstant(0, MVT::i32), Elt0, Elt1, + InFlag }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl, + CopyParamVTs, &CopyParamOps[0], 6, + MemVT, MachinePointerInfo()); + InFlag = Chain.getValue(1); + } else { + unsigned curOffset = 0; + // V4 stores + // We have at least 4 elements (<3 x Ty> expands to 4 elements) and + // the + // vector will be expanded to a power of 2 elements, so we know we can + // always round up to the next multiple of 4 when creating the vector + // stores. + // e.g. 4 elem => 1 st.v4 + // 6 elem => 2 st.v4 + // 8 elem => 2 st.v4 + // 11 elem => 3 st.v4 + unsigned VecSize = 4; + if (EltVT.getSizeInBits() == 64) + VecSize = 2; + + // This is potentially only part of a vector, so assume all elements + // are packed together. + unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize; + + for (unsigned i = 0; i < NumElts; i += VecSize) { + // Get values + SDValue StoreVal; + SmallVector<SDValue, 8> Ops; + Ops.push_back(Chain); + Ops.push_back(DAG.getConstant(paramCount, MVT::i32)); + Ops.push_back(DAG.getConstant(curOffset, MVT::i32)); + + unsigned Opc = NVPTXISD::StoreParamV2; + + StoreVal = OutVals[OIdx++]; + if (NeedExtend) + StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); + Ops.push_back(StoreVal); + + if (i + 1 < NumElts) { + StoreVal = OutVals[OIdx++]; + if (NeedExtend) + StoreVal = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); + } else { + StoreVal = DAG.getUNDEF(EltVT); + } + Ops.push_back(StoreVal); + + if (VecSize == 4) { + Opc = NVPTXISD::StoreParamV4; + if (i + 2 < NumElts) { + StoreVal = OutVals[OIdx++]; + if (NeedExtend) + StoreVal = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); + } else { + StoreVal = DAG.getUNDEF(EltVT); + } + Ops.push_back(StoreVal); + + if (i + 3 < NumElts) { + StoreVal = OutVals[OIdx++]; + if (NeedExtend) + StoreVal = + DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); + } else { + StoreVal = DAG.getUNDEF(EltVT); + } + Ops.push_back(StoreVal); + } + + SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, &Ops[0], + Ops.size(), MemVT, + MachinePointerInfo()); + InFlag = Chain.getValue(1); + curOffset += PerStoreOffset; + } + } + ++paramCount; + --OIdx; + continue; + } // Plain scalar // for ABI, declare .param .b<size> .param<n>; - // for nonABI, declare .reg .b<size> .param<n>; - unsigned isReg = 1; - if (isABI) - isReg = 0; unsigned sz = VT.getSizeInBits(); - if (VT.isInteger() && (sz < 32)) - sz = 32; + bool needExtend = false; + if (VT.isInteger()) { + if (sz < 16) + needExtend = true; + if (sz < 32) + sz = 32; + } SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue DeclareParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), - DAG.getConstant(isReg, MVT::i32), InFlag }; + DAG.getConstant(0, MVT::i32), InFlag }; Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, DeclareParamOps, 5); InFlag = Chain.getValue(1); + SDValue OutV = OutVals[OIdx]; + if (needExtend) { + // zext/sext i1 to i16 + unsigned opc = ISD::ZERO_EXTEND; + if (Outs[OIdx].Flags.isSExt()) + opc = ISD::SIGN_EXTEND; + OutV = DAG.getNode(opc, dl, MVT::i16, OutV); + } SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(0, MVT::i32), OutVals[i], - InFlag }; + DAG.getConstant(0, MVT::i32), OutV, InFlag }; unsigned opcode = NVPTXISD::StoreParam; - if (isReg) - opcode = NVPTXISD::MoveToParam; - else { - if (Outs[i].Flags.isZExt()) - opcode = NVPTXISD::StoreParamU32; - else if (Outs[i].Flags.isSExt()) - opcode = NVPTXISD::StoreParamS32; - } - Chain = DAG.getNode(opcode, dl, CopyParamVTs, CopyParamOps, 5); + if (Outs[OIdx].Flags.isZExt()) + opcode = NVPTXISD::StoreParamU32; + else if (Outs[OIdx].Flags.isSExt()) + opcode = NVPTXISD::StoreParamS32; + Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 5, + VT, MachinePointerInfo()); InFlag = Chain.getValue(1); ++paramCount; @@ -518,55 +903,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, assert(PTy && "Type of a byval parameter should be pointer"); ComputeValueVTs(*this, PTy->getElementType(), vtparts); - if (isABI) { - // declare .param .align 16 .b8 .param<n>[<size>]; - unsigned sz = Outs[i].Flags.getByValSize(); - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - // The ByValAlign in the Outs[i].Flags is alway set at this point, so we - // don't need to - // worry about natural alignment or not. See TargetLowering::LowerCallTo() - SDValue DeclareParamOps[] = { - Chain, DAG.getConstant(Outs[i].Flags.getByValAlign(), MVT::i32), - DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), - InFlag - }; - Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, - DeclareParamOps, 5); - InFlag = Chain.getValue(1); - unsigned curOffset = 0; - for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { - unsigned elems = 1; - EVT elemtype = vtparts[j]; - if (vtparts[j].isVector()) { - elems = vtparts[j].getVectorNumElements(); - elemtype = vtparts[j].getVectorElementType(); - } - for (unsigned k = 0, ke = elems; k != ke; ++k) { - unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i], - DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = - DAG.getLoad(elemtype, dl, tempChain, srcAddr, - MachinePointerInfo(), false, false, false, 0); - SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue CopyParamOps[] = { Chain, - DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(curOffset, MVT::i32), - theVal, InFlag }; - Chain = DAG.getNode(NVPTXISD::StoreParam, dl, CopyParamVTs, - CopyParamOps, 5); - InFlag = Chain.getValue(1); - curOffset += sz / 8; - } - } - ++paramCount; - continue; - } - // Non-abi, struct or vector - // Declare a bunch or .reg .b<size> .param<n> + // declare .param .align <align> .b8 .param<n>[<size>]; + unsigned sz = Outs[OIdx].Flags.getByValSize(); + SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + // The ByValAlign in the Outs[OIdx].Flags is alway set at this point, + // so we don't need to worry about natural alignment or not. + // See TargetLowering::LowerCallTo(). + SDValue DeclareParamOps[] = { + Chain, DAG.getConstant(Outs[OIdx].Flags.getByValAlign(), MVT::i32), + DAG.getConstant(paramCount, MVT::i32), DAG.getConstant(sz, MVT::i32), + InFlag + }; + Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs, + DeclareParamOps, 5); + InFlag = Chain.getValue(1); unsigned curOffset = 0; for (unsigned j = 0, je = vtparts.size(); j != je; ++j) { unsigned elems = 1; @@ -577,107 +927,66 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } for (unsigned k = 0, ke = elems; k != ke; ++k) { unsigned sz = elemtype.getSizeInBits(); - if (elemtype.isInteger() && (sz < 32)) - sz = 32; - SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareParamOps[] = { Chain, - DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(sz, MVT::i32), - DAG.getConstant(1, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs, - DeclareParamOps, 5); - InFlag = Chain.getValue(1); + if (elemtype.isInteger() && (sz < 8)) + sz = 8; SDValue srcAddr = - DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[i], + DAG.getNode(ISD::ADD, dl, getPointerTy(), OutVals[OIdx], DAG.getConstant(curOffset, getPointerTy())); - SDValue theVal = - DAG.getLoad(elemtype, dl, tempChain, srcAddr, MachinePointerInfo(), - false, false, false, 0); + SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr, + MachinePointerInfo(), false, false, false, + 0); + if (elemtype.getSizeInBits() < 16) { + theVal = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, theVal); + } SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue CopyParamOps[] = { Chain, DAG.getConstant(paramCount, MVT::i32), - DAG.getConstant(0, MVT::i32), theVal, + DAG.getConstant(curOffset, MVT::i32), theVal, InFlag }; - Chain = DAG.getNode(NVPTXISD::MoveToParam, dl, CopyParamVTs, - CopyParamOps, 5); + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, + CopyParamOps, 5, elemtype, + MachinePointerInfo()); + InFlag = Chain.getValue(1); - ++paramCount; + curOffset += sz / 8; } } + ++paramCount; } GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode()); unsigned retAlignment = 0; // Handle Result - unsigned retCount = 0; if (Ins.size() > 0) { SmallVector<EVT, 16> resvtparts; ComputeValueVTs(*this, retTy, resvtparts); - // Declare one .param .align 16 .b8 func_retval0[<size>] for ABI or - // individual .reg .b<size> func_retval<0..> for non ABI - unsigned resultsz = 0; - for (unsigned i = 0, e = resvtparts.size(); i != e; ++i) { - unsigned elems = 1; - EVT elemtype = resvtparts[i]; - if (resvtparts[i].isVector()) { - elems = resvtparts[i].getVectorNumElements(); - elemtype = resvtparts[i].getVectorElementType(); - } - for (unsigned j = 0, je = elems; j != je; ++j) { - unsigned sz = elemtype.getSizeInBits(); - if (isABI == false) { - if (elemtype.isInteger() && (sz < 32)) - sz = 32; - } else { - if (elemtype.isInteger() && (sz < 8)) - sz = 8; - } - if (isABI == false) { - SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, DAG.getConstant(2, MVT::i32), - DAG.getConstant(sz, MVT::i32), - DAG.getConstant(retCount, MVT::i32), - InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, - DeclareRetOps, 5); - InFlag = Chain.getValue(1); - ++retCount; - } - resultsz += sz; - } - } - if (isABI) { - if (retTy->isPrimitiveType() || retTy->isIntegerTy() || - retTy->isPointerTy()) { - // Scalar needs to be at least 32bit wide - if (resultsz < 32) - resultsz = 32; - SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), - DAG.getConstant(resultsz, MVT::i32), - DAG.getConstant(0, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, - DeclareRetOps, 5); - InFlag = Chain.getValue(1); - } else { - if (Func) { // direct call - if (!llvm::getAlign(*(CS->getCalledFunction()), 0, retAlignment)) - retAlignment = getDataLayout()->getABITypeAlignment(retTy); - } else { // indirect call - const CallInst *CallI = dyn_cast<CallInst>(CS->getInstruction()); - if (!llvm::getAlign(*CallI, 0, retAlignment)) - retAlignment = getDataLayout()->getABITypeAlignment(retTy); - } - SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue DeclareRetOps[] = { Chain, - DAG.getConstant(retAlignment, MVT::i32), - DAG.getConstant(resultsz / 8, MVT::i32), - DAG.getConstant(0, MVT::i32), InFlag }; - Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, - DeclareRetOps, 5); - InFlag = Chain.getValue(1); - } + // Declare + // .param .align 16 .b8 retval0[<size-in-bytes>], or + // .param .b<size-in-bits> retval0 + unsigned resultsz = TD->getTypeAllocSizeInBits(retTy); + if (retTy->isPrimitiveType() || retTy->isIntegerTy() || + retTy->isPointerTy()) { + // Scalar needs to be at least 32bit wide + if (resultsz < 32) + resultsz = 32; + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), + DAG.getConstant(resultsz, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); + } else { + retAlignment = getArgumentAlignment(Callee, CS, retTy, 0); + SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue DeclareRetOps[] = { Chain, + DAG.getConstant(retAlignment, MVT::i32), + DAG.getConstant(resultsz / 8, MVT::i32), + DAG.getConstant(0, MVT::i32), InFlag }; + Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs, + DeclareRetOps, 5); + InFlag = Chain.getValue(1); } } @@ -690,7 +999,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // The prototype is embedded in a string and put as the operand for an // INLINEASM SDNode. SDVTList InlineAsmVTs = DAG.getVTList(MVT::Other, MVT::Glue); - std::string proto_string = getPrototype(retTy, Args, Outs, retAlignment); + std::string proto_string = + getPrototype(retTy, Args, Outs, retAlignment, CS); const char *asmstr = nvTM->getManagedStrPool() ->getManagedString(proto_string.c_str())->c_str(); SDValue InlineAsmOps[] = { @@ -703,9 +1013,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Op to just print "call" SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue); SDValue PrintCallOps[] = { - Chain, - DAG.getConstant(isABI ? ((Ins.size() == 0) ? 0 : 1) : retCount, MVT::i32), - InFlag + Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag }; Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall), dl, PrintCallVTs, PrintCallOps, 3); @@ -753,59 +1061,172 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Generate loads from param memory/moves from registers for result if (Ins.size() > 0) { - if (isABI) { - unsigned resoffset = 0; - for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - unsigned sz = Ins[i].VT.getSizeInBits(); - if (Ins[i].VT.isInteger() && (sz < 8)) - sz = 8; - EVT LoadRetVTs[] = { Ins[i].VT, MVT::Other, MVT::Glue }; - SDValue LoadRetOps[] = { Chain, DAG.getConstant(1, MVT::i32), - DAG.getConstant(resoffset, MVT::i32), InFlag }; - SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, LoadRetVTs, - LoadRetOps, array_lengthof(LoadRetOps)); + unsigned resoffset = 0; + if (retTy && retTy->isVectorTy()) { + EVT ObjectVT = getValueType(retTy); + unsigned NumElts = ObjectVT.getVectorNumElements(); + EVT EltVT = ObjectVT.getVectorElementType(); + assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts && + "Vector was not scalarized"); + unsigned sz = EltVT.getSizeInBits(); + bool needTruncate = sz < 16 ? true : false; + + if (NumElts == 1) { + // Just a simple load + std::vector<EVT> LoadRetVTs; + if (needTruncate) { + // If loading i1 result, generate + // load i16 + // trunc i16 to i1 + LoadRetVTs.push_back(MVT::i16); + } else + LoadRetVTs.push_back(EltVT); + LoadRetVTs.push_back(MVT::Other); + LoadRetVTs.push_back(MVT::Glue); + std::vector<SDValue> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getMemIntrinsicNode( + NVPTXISD::LoadParam, dl, + DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], + LoadRetOps.size(), EltVT, MachinePointerInfo()); Chain = retval.getValue(1); InFlag = retval.getValue(2); - InVals.push_back(retval); - resoffset += sz / 8; + SDValue Ret0 = retval; + if (needTruncate) + Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0); + InVals.push_back(Ret0); + } else if (NumElts == 2) { + // LoadV2 + std::vector<EVT> LoadRetVTs; + if (needTruncate) { + // If loading i1 result, generate + // load i16 + // trunc i16 to i1 + LoadRetVTs.push_back(MVT::i16); + LoadRetVTs.push_back(MVT::i16); + } else { + LoadRetVTs.push_back(EltVT); + LoadRetVTs.push_back(EltVT); + } + LoadRetVTs.push_back(MVT::Other); + LoadRetVTs.push_back(MVT::Glue); + std::vector<SDValue> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(0, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getMemIntrinsicNode( + NVPTXISD::LoadParamV2, dl, + DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], + LoadRetOps.size(), EltVT, MachinePointerInfo()); + Chain = retval.getValue(2); + InFlag = retval.getValue(3); + SDValue Ret0 = retval.getValue(0); + SDValue Ret1 = retval.getValue(1); + if (needTruncate) { + Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0); + InVals.push_back(Ret0); + Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1); + InVals.push_back(Ret1); + } else { + InVals.push_back(Ret0); + InVals.push_back(Ret1); + } + } else { + // Split into N LoadV4 + unsigned Ofst = 0; + unsigned VecSize = 4; + unsigned Opc = NVPTXISD::LoadParamV4; + if (EltVT.getSizeInBits() == 64) { + VecSize = 2; + Opc = NVPTXISD::LoadParamV2; + } + EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize); + for (unsigned i = 0; i < NumElts; i += VecSize) { + SmallVector<EVT, 8> LoadRetVTs; + if (needTruncate) { + // If loading i1 result, generate + // load i16 + // trunc i16 to i1 + for (unsigned j = 0; j < VecSize; ++j) + LoadRetVTs.push_back(MVT::i16); + } else { + for (unsigned j = 0; j < VecSize; ++j) + LoadRetVTs.push_back(EltVT); + } + LoadRetVTs.push_back(MVT::Other); + LoadRetVTs.push_back(MVT::Glue); + SmallVector<SDValue, 4> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getMemIntrinsicNode( + Opc, dl, DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), + &LoadRetOps[0], LoadRetOps.size(), EltVT, MachinePointerInfo()); + if (VecSize == 2) { + Chain = retval.getValue(2); + InFlag = retval.getValue(3); + } else { + Chain = retval.getValue(4); + InFlag = retval.getValue(5); + } + + for (unsigned j = 0; j < VecSize; ++j) { + if (i + j >= NumElts) + break; + SDValue Elt = retval.getValue(j); + if (needTruncate) + Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); + InVals.push_back(Elt); + } + Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); + } } } else { - SmallVector<EVT, 16> resvtparts; - ComputeValueVTs(*this, retTy, resvtparts); - - assert(Ins.size() == resvtparts.size() && - "Unexpected number of return values in non-ABI case"); - unsigned paramNum = 0; + SmallVector<EVT, 16> VTs; + ComputePTXValueVTs(*this, retTy, VTs); + assert(VTs.size() == Ins.size() && "Bad value decomposition"); for (unsigned i = 0, e = Ins.size(); i != e; ++i) { - assert(EVT(Ins[i].VT) == resvtparts[i] && - "Unexpected EVT type in non-ABI case"); - unsigned numelems = 1; - EVT elemtype = Ins[i].VT; - if (Ins[i].VT.isVector()) { - numelems = Ins[i].VT.getVectorNumElements(); - elemtype = Ins[i].VT.getVectorElementType(); - } - std::vector<SDValue> tempRetVals; - for (unsigned j = 0; j < numelems; ++j) { - EVT MoveRetVTs[] = { elemtype, MVT::Other, MVT::Glue }; - SDValue MoveRetOps[] = { Chain, DAG.getConstant(0, MVT::i32), - DAG.getConstant(paramNum, MVT::i32), - InFlag }; - SDValue retval = DAG.getNode(NVPTXISD::LoadParam, dl, MoveRetVTs, - MoveRetOps, array_lengthof(MoveRetOps)); - Chain = retval.getValue(1); - InFlag = retval.getValue(2); - tempRetVals.push_back(retval); - ++paramNum; - } - if (Ins[i].VT.isVector()) - InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, Ins[i].VT, - &tempRetVals[0], tempRetVals.size())); - else - InVals.push_back(tempRetVals[0]); + unsigned sz = VTs[i].getSizeInBits(); + bool needTruncate = sz < 8 ? true : false; + if (VTs[i].isInteger() && (sz < 8)) + sz = 8; + + SmallVector<EVT, 4> LoadRetVTs; + if (sz < 16) { + // If loading i1/i8 result, generate + // load i8 (-> i16) + // trunc i16 to i1/i8 + LoadRetVTs.push_back(MVT::i16); + } else + LoadRetVTs.push_back(Ins[i].VT); + LoadRetVTs.push_back(MVT::Other); + LoadRetVTs.push_back(MVT::Glue); + + SmallVector<SDValue, 4> LoadRetOps; + LoadRetOps.push_back(Chain); + LoadRetOps.push_back(DAG.getConstant(1, MVT::i32)); + LoadRetOps.push_back(DAG.getConstant(resoffset, MVT::i32)); + LoadRetOps.push_back(InFlag); + SDValue retval = DAG.getMemIntrinsicNode( + NVPTXISD::LoadParam, dl, + DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0], + LoadRetOps.size(), VTs[i], MachinePointerInfo()); + Chain = retval.getValue(1); + InFlag = retval.getValue(2); + SDValue Ret0 = retval.getValue(0); + if (needTruncate) + Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0); + InVals.push_back(Ret0); + resoffset += sz / 8; } } } + Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(uniqueCallSite, true), DAG.getIntPtrConstant(uniqueCallSite + 1, true), InFlag, dl); @@ -874,8 +1295,8 @@ SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // v = ld i1* addr // => -// v1 = ld i8* addr -// v = trunc v1 to i1 +// v1 = ld i8* addr (-> i16) +// v = trunc i16 to i1 SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); LoadSDNode *LD = cast<LoadSDNode>(Node); @@ -884,7 +1305,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const { assert(Node->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only"); SDValue newLD = - DAG.getLoad(MVT::i8, dl, LD->getChain(), LD->getBasePtr(), + DAG.getLoad(MVT::i16, dl, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD); @@ -942,9 +1363,9 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { // Since StoreV2 is a target node, we cannot rely on DAG type legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the // stored type to i16 and propogate the "real" type as the memory type. - bool NeedExt = false; + bool NeedSExt = false; if (EltVT.getSizeInBits() < 16) - NeedExt = true; + NeedSExt = true; switch (NumElts) { default: @@ -967,10 +1388,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { for (unsigned i = 0; i < NumElts; ++i) { SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val, DAG.getIntPtrConstant(i)); - if (NeedExt) - // ANY_EXTEND is correct here since the store will only look at the - // lower-order bits anyway. - ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal); + if (NeedSExt) + ExtVal = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i16, ExtVal); Ops.push_back(ExtVal); } @@ -994,8 +1413,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { // st i1 v, addr // => -// v1 = zxt v to i8 -// st i8, addr +// v1 = zxt v to i16 +// st.u8 i16, addr SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { SDNode *Node = Op.getNode(); SDLoc dl(Node); @@ -1007,9 +1426,10 @@ SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const { unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); - Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Tmp3); - SDValue Result = DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment); + Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3); + SDValue Result = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, + ST->getPointerInfo(), MVT::i8, isNonTemporal, + isVolatile, Alignment); return Result; } @@ -1116,7 +1536,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (Ty->isAggregateType()) { SmallVector<EVT, 16> vtparts; - ComputeValueVTs(*this, Ty, vtparts); + ComputePTXValueVTs(*this, Ty, vtparts); assert(vtparts.size() > 0 && "empty aggregate type not expected"); for (unsigned parti = 0, parte = vtparts.size(); parti != parte; ++parti) { @@ -1152,7 +1572,10 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( SmallVector<EVT, 16> vtparts; SmallVector<uint64_t, 16> offsets; - ComputeValueVTs(*this, Ty, vtparts, &offsets, 0); + // NOTE: Here, we lose the ability to issue vector loads for vectors + // that are a part of a struct. This should be investigated in the + // future. + ComputePTXValueVTs(*this, Ty, vtparts, &offsets, 0); assert(vtparts.size() > 0 && "empty aggregate type not expected"); bool aggregateIsPacked = false; if (StructType *STy = llvm::dyn_cast<StructType>(Ty)) @@ -1172,9 +1595,15 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( aggregateIsPacked ? 1 : TD->getABITypeAlignment( partVT.getTypeForEVT(F->getContext())); - SDValue p = DAG.getLoad(partVT, dl, Root, srcAddr, - MachinePointerInfo(srcValue), false, false, - true, partAlign); + SDValue p; + if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) + p = DAG.getExtLoad(ISD::SEXTLOAD, dl, Ins[InsIdx].VT, Root, srcAddr, + MachinePointerInfo(srcValue), partVT, false, + false, partAlign); + else + p = DAG.getLoad(partVT, dl, Root, srcAddr, + MachinePointerInfo(srcValue), false, false, false, + partAlign); if (p.getNode()) p.getNode()->setIROrder(idx + 1); InVals.push_back(p); @@ -1208,6 +1637,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( if (P.getNode()) P.getNode()->setIROrder(idx + 1); + if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) + P = DAG.getNode(ISD::SIGN_EXTEND, dl, Ins[InsIdx].VT, P); InVals.push_back(P); Ofst += TD->getTypeAllocSize(EltVT.getTypeForEVT(F->getContext())); ++InsIdx; @@ -1230,6 +1661,12 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( DAG.getIntPtrConstant(0)); SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, DAG.getIntPtrConstant(1)); + + if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) { + Elt0 = DAG.getNode(ISD::SIGN_EXTEND, dl, Ins[InsIdx].VT, Elt0); + Elt1 = DAG.getNode(ISD::SIGN_EXTEND, dl, Ins[InsIdx].VT, Elt1); + } + InVals.push_back(Elt0); InVals.push_back(Elt1); Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); @@ -1269,6 +1706,8 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( break; SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P, DAG.getIntPtrConstant(j)); + if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) + Elt = DAG.getNode(ISD::SIGN_EXTEND, dl, Ins[InsIdx].VT, Elt); InVals.push_back(Elt); } Ofst += TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); @@ -1282,16 +1721,19 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( } // A plain scalar. EVT ObjectVT = getValueType(Ty); - assert(ObjectVT == Ins[InsIdx].VT && - "Ins type did not match function type"); // If ABI, load from the param symbol SDValue Arg = getParamSymbol(DAG, idx, getPointerTy()); Value *srcValue = Constant::getNullValue(PointerType::get( ObjectVT.getTypeForEVT(F->getContext()), llvm::ADDRESS_SPACE_PARAM)); - SDValue p = DAG.getLoad( - ObjectVT, dl, Root, Arg, MachinePointerInfo(srcValue), false, false, - true, - TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + SDValue p; + if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) + p = DAG.getExtLoad(ISD::SEXTLOAD, dl, Ins[InsIdx].VT, Root, Arg, + MachinePointerInfo(srcValue), ObjectVT, false, false, + TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); + else + p = DAG.getLoad(Ins[InsIdx].VT, dl, Root, Arg, + MachinePointerInfo(srcValue), false, false, false, + TD->getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext()))); if (p.getNode()) p.getNode()->setIROrder(idx + 1); InVals.push_back(p); @@ -1360,26 +1802,38 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned NumElts = VTy->getNumElements(); assert(NumElts == Outs.size() && "Bad scalarization of return value"); + // const_cast can be removed in later LLVM versions + EVT EltVT = getValueType(const_cast<Type *>(RetTy)).getVectorElementType(); + bool NeedExtend = false; + if (EltVT.getSizeInBits() < 16) + NeedExtend = true; + // V1 store if (NumElts == 1) { SDValue StoreVal = OutVals[0]; // We only have one element, so just directly store it - if (StoreVal.getValueType().getSizeInBits() < 8) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal); - Chain = DAG.getNode(NVPTXISD::StoreRetval, dl, MVT::Other, Chain, - DAG.getConstant(0, MVT::i32), StoreVal); + if (NeedExtend) + StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal); + SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), &Ops[0], 3, + EltVT, MachinePointerInfo()); + } else if (NumElts == 2) { // V2 store SDValue StoreVal0 = OutVals[0]; SDValue StoreVal1 = OutVals[1]; - if (StoreVal0.getValueType().getSizeInBits() < 8) { - StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal0); - StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal1); + if (NeedExtend) { + StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0); + StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1); } - Chain = DAG.getNode(NVPTXISD::StoreRetvalV2, dl, MVT::Other, Chain, - DAG.getConstant(0, MVT::i32), StoreVal0, StoreVal1); + SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0, + StoreVal1 }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl, + DAG.getVTList(MVT::Other), &Ops[0], 4, + EltVT, MachinePointerInfo()); } else { // V4 stores // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the @@ -1402,10 +1856,6 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, unsigned PerStoreOffset = TD->getTypeAllocSize(VecVT.getTypeForEVT(F->getContext())); - bool Extend = false; - if (OutVals[0].getValueType().getSizeInBits() < 8) - Extend = true; - for (unsigned i = 0; i < NumElts; i += VecSize) { // Get values SDValue StoreVal; @@ -1413,17 +1863,17 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Ops.push_back(Chain); Ops.push_back(DAG.getConstant(Offset, MVT::i32)); unsigned Opc = NVPTXISD::StoreRetvalV2; - EVT ExtendedVT = (Extend) ? MVT::i8 : OutVals[0].getValueType(); + EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType(); StoreVal = OutVals[i]; - if (Extend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal); + if (NeedExtend) + StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); Ops.push_back(StoreVal); if (i + 1 < NumElts) { StoreVal = OutVals[i + 1]; - if (Extend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal); + if (NeedExtend) + StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); } else { StoreVal = DAG.getUNDEF(ExtendedVT); } @@ -1433,8 +1883,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, Opc = NVPTXISD::StoreRetvalV4; if (i + 2 < NumElts) { StoreVal = OutVals[i + 2]; - if (Extend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal); + if (NeedExtend) + StoreVal = + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); } else { StoreVal = DAG.getUNDEF(ExtendedVT); } @@ -1442,19 +1893,29 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (i + 3 < NumElts) { StoreVal = OutVals[i + 3]; - if (Extend) - StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, StoreVal); + if (NeedExtend) + StoreVal = + DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal); } else { StoreVal = DAG.getUNDEF(ExtendedVT); } Ops.push_back(StoreVal); } - Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); + // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size()); + Chain = + DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), &Ops[0], + Ops.size(), EltVT, MachinePointerInfo()); Offset += PerStoreOffset; } } } else { + SmallVector<EVT, 16> ValVTs; + // const_cast is necessary since we are still using an LLVM version from + // before the type system re-write. + ComputePTXValueVTs(*this, const_cast<Type *>(RetTy), ValVTs); + assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition"); + unsigned sizesofar = 0; for (unsigned i = 0, e = Outs.size(); i != e; ++i) { SDValue theVal = OutVals[i]; @@ -1471,13 +1932,15 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT theStoreType = tmpval.getValueType(); if (theStoreType.getSizeInBits() < 8) tmpval = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, tmpval); - Chain = DAG.getNode(NVPTXISD::StoreRetval, dl, MVT::Other, Chain, - DAG.getConstant(sizesofar, MVT::i32), tmpval); + SDValue Ops[] = { Chain, DAG.getConstant(sizesofar, MVT::i32), tmpval }; + Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl, + DAG.getVTList(MVT::Other), &Ops[0], 3, + ValVTs[i], MachinePointerInfo()); if (theValType.isVector()) sizesofar += - theValType.getVectorElementType().getStoreSizeInBits() / 8; + ValVTs[i].getVectorElementType().getStoreSizeInBits() / 8; else - sizesofar += theValType.getStoreSizeInBits() / 8; + sizesofar += ValVTs[i].getStoreSizeInBits() / 8; } } } @@ -1485,6 +1948,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, return DAG.getNode(NVPTXISD::RET_FLAG, dl, MVT::Other, Chain); } + void NVPTXTargetLowering::LowerAsmOperandForConstraint( SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { @@ -1548,9 +2012,9 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic( Info.opc = ISD::INTRINSIC_W_CHAIN; if (Intrinsic == Intrinsic::nvvm_ldu_global_i) - Info.memVT = MVT::i32; + Info.memVT = getValueType(I.getType()); else if (Intrinsic == Intrinsic::nvvm_ldu_global_p) - Info.memVT = getPointerTy(); + Info.memVT = getValueType(I.getType()); else Info.memVT = MVT::f32; Info.ptrVal = I.getArgOperand(0); @@ -1635,7 +2099,7 @@ NVPTXTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint, if (Constraint.size() == 1) { switch (Constraint[0]) { case 'c': - return std::make_pair(0U, &NVPTX::Int8RegsRegClass); + return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'h': return std::make_pair(0U, &NVPTX::Int16RegsRegClass); case 'r': @@ -1775,7 +2239,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, unsigned NumElts = ResVT.getVectorNumElements(); EVT EltVT = ResVT.getVectorElementType(); - // Since LDU/LDG are target nodes, we cannot rely on DAG type legalization. + // Since LDU/LDG are target nodes, we cannot rely on DAG type + // legalization. // Therefore, we must ensure the type is legal. For i1 and i8, we set the // loaded type to i16 and propogate the "real" type as the memory type. bool NeedTrunc = false; @@ -1834,7 +2299,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, OtherOps.push_back(Chain); // Chain // Skip operand 1 (intrinsic ID) - // Others + // Others for (unsigned i = 2, e = N->getNumOperands(); i != e; ++i) OtherOps.push_back(N->getOperand(i)); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h index b0dad0f0d7..43c63ae739 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/lib/Target/NVPTX/NVPTXISelLowering.h @@ -35,14 +35,6 @@ enum NodeType { DeclareRetParam, DeclareRet, DeclareScalarRet, - LoadParam, - LoadParamV2, - LoadParamV4, - StoreParam, - StoreParamV2, - StoreParamV4, - StoreParamS32, // to sext and store a <32bit value, not used currently - StoreParamU32, // to zext and store a <32bit value, not used currently MoveToParam, PrintCall, PrintCallUni, @@ -57,9 +49,6 @@ enum NodeType { MoveParam, MoveRetval, MoveToRetval, - StoreRetval, - StoreRetvalV2, - StoreRetvalV4, PseudoUseParam, RETURN, CallSeqBegin, @@ -73,7 +62,18 @@ enum NodeType { LDUV2, // LDU.v2 LDUV4, // LDU.v4 StoreV2, - StoreV4 + StoreV4, + LoadParam, + LoadParamV2, + LoadParamV4, + StoreParam, + StoreParamV2, + StoreParamV4, + StoreParamS32, // to sext and store a <32bit value, not used currently + StoreParamU32, // to zext and store a <32bit value, not used currently + StoreRetval, + StoreRetvalV2, + StoreRetvalV4 }; } @@ -126,7 +126,8 @@ public: std::string getPrototype(Type *, const ArgListTy &, const SmallVectorImpl<ISD::OutputArg> &, - unsigned retAlignment) const; + unsigned retAlignment, + const ImmutableCallSite *CS) const; virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -164,6 +165,9 @@ private: virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const; + + unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS, + Type *Ty, unsigned Idx) const; }; } // namespace llvm diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp index 80af163a49..b406aa9288 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp +++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp @@ -51,9 +51,6 @@ void NVPTXInstrInfo::copyPhysReg( else if (DestRC == &NVPTX::Int16RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV16rr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); - else if (DestRC == &NVPTX::Int8RegsRegClass) - BuildMI(MBB, I, DL, get(NVPTX::IMOV8rr), DestReg) - .addReg(SrcReg, getKillRegState(KillSrc)); else if (DestRC == &NVPTX::Int64RegsRegClass) BuildMI(MBB, I, DL, get(NVPTX::IMOV64rr), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index c980237408..965af511e1 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -82,101 +82,6 @@ def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; def true : Predicate<"1">; -//===----------------------------------------------------------------------===// -// Special Handling for 8-bit Operands and Operations -// -// PTX supports 8-bit signed and unsigned types, but does not support 8-bit -// operations (like add, shift, etc) except for ld/st/cvt. SASS does not have -// 8-bit registers. -// -// PTX ld, st and cvt instructions permit source and destination data operands -// to be wider than the instruction-type size, so that narrow values may be -// loaded, stored, and converted using regular-width registers. -// -// So in PTX generation, we -// - always use 16-bit registers in place in 8-bit registers. -// (8-bit variables should stay as 8-bit as they represent memory layout.) -// - for the following 8-bit operations, we sign-ext/zero-ext the 8-bit values -// before operation -// . div -// . rem -// . neg (sign) -// . set, setp -// . shr -// -// We are patching the operations by inserting the cvt instructions in the -// asm strings of the affected instructions. -// -// Since vector operations, except for ld/st, are eventually elementized. We -// do not need to special-hand the vector 8-bit operations. -// -// -//===----------------------------------------------------------------------===// - -// Generate string block like -// { -// .reg .s16 %temp1; -// .reg .s16 %temp2; -// cvt.s16.s8 %temp1, %a; -// cvt.s16.s8 %temp2, %b; -// opc.s16 %dst, %temp1, %temp2; -// } -// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8 -class Handle_i8rr<string OpcStr, string TypeStr, string CVTStr> { - string s = !strconcat("{{\n\t", - !strconcat(".reg .", !strconcat(TypeStr, - !strconcat(" \t%temp1;\n\t", - !strconcat(".reg .", !strconcat(TypeStr, - !strconcat(" \t%temp2;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t", - !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}")))))))))))); -} - -// Generate string block like -// { -// .reg .s16 %temp1; -// .reg .s16 %temp2; -// cvt.s16.s8 %temp1, %a; -// mov.b16 %temp2, %b; -// cvt.s16.s8 %temp2, %temp2; -// opc.s16 %dst, %temp1, %temp2; -// } -// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8 -class Handle_i8ri<string OpcStr, string TypeStr, string CVTStr> { - string s = !strconcat("{{\n\t", - !strconcat(".reg .", !strconcat(TypeStr, - !strconcat(" \t%temp1;\n\t", - !strconcat(".reg .", - !strconcat(TypeStr, !strconcat(" \t%temp2;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp1, $a;\n\t", - !strconcat("mov.b16 \t%temp2, $b;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp2, %temp2;\n\t", - !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))))); -} - -// Generate string block like -// { -// .reg .s16 %temp1; -// .reg .s16 %temp2; -// mov.b16 %temp1, %b; -// cvt.s16.s8 %temp1, %temp1; -// cvt.s16.s8 %temp2, %a; -// opc.s16 %dst, %temp1, %temp2; -// } -// when OpcStr=opc.s TypeStr=s16 CVTStr=cvt.s16.s8 -class Handle_i8ir<string OpcStr, string TypeStr, string CVTStr> { - string s = !strconcat("{{\n\t", - !strconcat(".reg .", !strconcat(TypeStr, - !strconcat(" \t%temp1;\n\t", - !strconcat(".reg .", !strconcat(TypeStr, - !strconcat(" \t%temp2;\n\t", - !strconcat("mov.b16 \t%temp1, $a;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp1, %temp1;\n\t", - !strconcat(CVTStr, !strconcat(" \t%temp2, $b;\n\t", - !strconcat(OpcStr, "16 \t$dst, %temp1, %temp2;\n\t}}"))))))))))))); -} - //===----------------------------------------------------------------------===// // Some Common Instruction Class Templates @@ -204,66 +109,6 @@ multiclass I3<string OpcStr, SDNode OpNode> { def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), !strconcat(OpcStr, "16 \t$dst, $a, $b;"), [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; - def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; - def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; -} - -multiclass I3_i8<string OpcStr, SDNode OpNode, string TypeStr, string CVTStr> { - def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int64Regs:$b))]>; - def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int16Regs:$b))]>; - def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; - def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; - def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, (imm):$b))]>; -} - -multiclass I3_noi8<string OpcStr, SDNode OpNode> { - def i64rr : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, Int64Regs:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, - Int64Regs:$b))]>; - def i64ri : NVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$a, i64imm:$b), - !strconcat(OpcStr, "64 \t$dst, $a, $b;"), - [(set Int64Regs:$dst, (OpNode Int64Regs:$a, imm:$b))]>; - def i32rr : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, - Int32Regs:$b))]>; - def i32ri : NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$a, i32imm:$b), - !strconcat(OpcStr, "32 \t$dst, $a, $b;"), - [(set Int32Regs:$dst, (OpNode Int32Regs:$a, imm:$b))]>; - def i16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, - Int16Regs:$b))]>; - def i16ri : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, i16imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (imm):$b))]>; } multiclass ADD_SUB_INT_32<string OpcStr, SDNode OpNode> { @@ -522,81 +367,17 @@ def : Pat<(mul (zext Int16Regs:$a), (i32 UInt16Const:$b)), defm MULT : I3<"mul.lo.s", mul>; -defm MULTHS : I3_noi8<"mul.hi.s", mulhs>; -defm MULTHU : I3_noi8<"mul.hi.u", mulhu>; -def MULTHSi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - !strconcat("{{ \n\t", - !strconcat(".reg \t.s16 temp1; \n\t", - !strconcat(".reg \t.s16 temp2; \n\t", - !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", - !strconcat("cvt.s16.s8 \ttemp2, $b; \n\t", - !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", - !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", - !strconcat("}}", "")))))))), - [(set Int8Regs:$dst, (mulhs Int8Regs:$a, Int8Regs:$b))]>; -def MULTHSi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - !strconcat("{{ \n\t", - !strconcat(".reg \t.s16 temp1; \n\t", - !strconcat(".reg \t.s16 temp2; \n\t", - !strconcat("cvt.s16.s8 \ttemp1, $a; \n\t", - !strconcat("mov.b16 \ttemp2, $b; \n\t", - !strconcat("cvt.s16.s8 \ttemp2, temp2; \n\t", - !strconcat("mul.lo.s16 \t$dst, temp1, temp2; \n\t", - !strconcat("shr.s16 \t$dst, $dst, 8; \n\t", - !strconcat("}}", ""))))))))), - [(set Int8Regs:$dst, (mulhs Int8Regs:$a, imm:$b))]>; -def MULTHUi8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - !strconcat("{{ \n\t", - !strconcat(".reg \t.u16 temp1; \n\t", - !strconcat(".reg \t.u16 temp2; \n\t", - !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", - !strconcat("cvt.u16.u8 \ttemp2, $b; \n\t", - !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", - !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", - !strconcat("}}", "")))))))), - [(set Int8Regs:$dst, (mulhu Int8Regs:$a, Int8Regs:$b))]>; -def MULTHUi8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - !strconcat("{{ \n\t", - !strconcat(".reg \t.u16 temp1; \n\t", - !strconcat(".reg \t.u16 temp2; \n\t", - !strconcat("cvt.u16.u8 \ttemp1, $a; \n\t", - !strconcat("mov.b16 \ttemp2, $b; \n\t", - !strconcat("cvt.u16.u8 \ttemp2, temp2; \n\t", - !strconcat("mul.lo.u16 \t$dst, temp1, temp2; \n\t", - !strconcat("shr.u16 \t$dst, $dst, 8; \n\t", - !strconcat("}}", ""))))))))), - [(set Int8Regs:$dst, (mulhu Int8Regs:$a, imm:$b))]>; - - -defm SDIV : I3_i8<"div.s", sdiv, "s16", "cvt.s16.s8">; -defm UDIV : I3_i8<"div.u", udiv, "u16", "cvt.u16.u8">; - -defm SREM : I3_i8<"rem.s", srem, "s16", "cvt.s16.s8">; +defm MULTHS : I3<"mul.hi.s", mulhs>; +defm MULTHU : I3<"mul.hi.u", mulhu>; + +defm SDIV : I3<"div.s", sdiv>; +defm UDIV : I3<"div.u", udiv>; + +defm SREM : I3<"rem.s", srem>; // The ri version will not be selected as DAGCombiner::visitSREM will lower it. -defm UREM : I3_i8<"rem.u", urem, "u16", "cvt.u16.u8">; +defm UREM : I3<"rem.u", urem>; // The ri version will not be selected as DAGCombiner::visitUREM will lower it. -def MAD8rrr : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, Int8Regs:$b, Int8Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), - Int8Regs:$c))]>; -def MAD8rri : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, Int8Regs:$b, i8imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int8Regs:$dst, (add (mul Int8Regs:$a, Int8Regs:$b), - imm:$c))]>; -def MAD8rir : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, i8imm:$b, Int8Regs:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), - Int8Regs:$c))]>; -def MAD8rii : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, i8imm:$b, i8imm:$c), - "mad.lo.s16 \t$dst, $a, $b, $c;", - [(set Int8Regs:$dst, (add (mul Int8Regs:$a, imm:$b), - imm:$c))]>; - def MAD16rrr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int16Regs:$c), "mad.lo.s16 \t$dst, $a, $b, $c;", @@ -661,10 +442,6 @@ def MAD64rii : NVPTXInst<(outs Int64Regs:$dst), (mul Int64Regs:$a, imm:$b), imm:$c))]>; -def INEG8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), - !strconcat("cvt.s16.s8 \t$dst, $src;\n\t", - "neg.s16 \t$dst, $dst;"), - [(set Int8Regs:$dst, (ineg Int8Regs:$src))]>; def INEG16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "neg.s16 \t$dst, $src;", [(set Int16Regs:$dst, (ineg Int16Regs:$src))]>; @@ -974,12 +751,6 @@ multiclass LOG_FORMAT<string OpcStr, SDNode OpNode> { def b1ri: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$a, i1imm:$b), !strconcat(OpcStr, ".pred \t$dst, $a, $b;"), [(set Int1Regs:$dst, (OpNode Int1Regs:$a, imm:$b))]>; - def b8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; - def b8ri: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; def b16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, ".b16 \t$dst, $a, $b;"), [(set Int16Regs:$dst, (OpNode Int16Regs:$a, @@ -1010,9 +781,6 @@ defm XOR : LOG_FORMAT<"xor", xor>; def NOT1: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$src), "not.pred \t$dst, $src;", [(set Int1Regs:$dst, (not Int1Regs:$src))]>; -def NOT8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), - "not.b16 \t$dst, $src;", - [(set Int8Regs:$dst, (not Int8Regs:$src))]>; def NOT16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "not.b16 \t$dst, $src;", [(set Int16Regs:$dst, (not Int16Regs:$src))]>; @@ -1056,14 +824,6 @@ multiclass LSHIFT_FORMAT<string OpcStr, SDNode OpNode> { !strconcat(OpcStr, "16 \t$dst, $a, $b;"), [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; - def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, - Int32Regs:$b))]>; - def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), - !strconcat(OpcStr, "16 \t$dst, $a, $b;"), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, - (i32 imm:$b)))]>; } defm SHL : LSHIFT_FORMAT<"shl.b", shl>; @@ -1102,16 +862,6 @@ multiclass RSHIFT_FORMAT<string OpcStr, SDNode OpNode, string CVTStr> { !strconcat(OpcStr, "16 \t$dst, $a, $b;"), [(set Int16Regs:$dst, (OpNode Int16Regs:$a, (i32 imm:$b)))]>; - def i8rr : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int32Regs:$b), - !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", - !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, - Int32Regs:$b))]>; - def i8ri : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, i32imm:$b), - !strconcat(CVTStr, !strconcat(" \t$dst, $a;\n\t", - !strconcat(OpcStr, "16 \t$dst, $dst, $b;"))), - [(set Int8Regs:$dst, (OpNode Int8Regs:$a, - (i32 imm:$b)))]>; } defm SRA : RSHIFT_FORMAT<"shr.s", sra, "cvt.s16.s8">; @@ -1257,8 +1007,6 @@ def MOV_ADDR64 : NVPTXInst<(outs Int64Regs:$dst), (ins imem:$a), let IsSimpleMove=1 in { def IMOV1rr: NVPTXInst<(outs Int1Regs:$dst), (ins Int1Regs:$sss), "mov.pred \t$dst, $sss;", []>; -def IMOV8rr: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$sss), - "mov.u16 \t$dst, $sss;", []>; def IMOV16rr: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$sss), "mov.u16 \t$dst, $sss;", []>; def IMOV32rr: NVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$sss), @@ -1274,9 +1022,6 @@ def FMOV64rr: NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$src), def IMOV1ri: NVPTXInst<(outs Int1Regs:$dst), (ins i1imm:$src), "mov.pred \t$dst, $src;", [(set Int1Regs:$dst, imm:$src)]>; -def IMOV8ri: NVPTXInst<(outs Int8Regs:$dst), (ins i8imm:$src), - "mov.u16 \t$dst, $src;", - [(set Int8Regs:$dst, imm:$src)]>; def IMOV16ri: NVPTXInst<(outs Int16Regs:$dst), (ins i16imm:$src), "mov.u16 \t$dst, $src;", [(set Int16Regs:$dst, imm:$src)]>; @@ -1331,47 +1076,8 @@ class Set_Str<string OpcStr, string sz1, string sz2, string d, string a, string s = !strconcat(t11, ", -1, 0, p;\n\t}}"); } -// Generate string block like -// { -// .reg .pred p; -// .reg .s16 %temp1; -// .reg .s16 %temp2; -// cvt.s16.s8 %temp1, %a; -// cvt s16.s8 %temp1, %b; -// setp.gt.s16 p, %temp1, %temp2; -// selp.s16 %dst, -1, 0, p; -// } -// when OpcStr=setp.gt.s d=%dst a=%a b=%b type=s16 cvt=cvt.s16.s8 -class Set_Stri8<string OpcStr, string d, string a, string b, string type, - string cvt> { - string t1 = "{{\n\t.reg .pred p;\n\t"; - string t2 = !strconcat(t1, ".reg ."); - string t3 = !strconcat(t2, type); - string t4 = !strconcat(t3, " %temp1;\n\t"); - string t5 = !strconcat(t4, ".reg ."); - string t6 = !strconcat(t5, type); - string t7 = !strconcat(t6, " %temp2;\n\t"); - string t8 = !strconcat(t7, cvt); - string t9 = !strconcat(t8, " \t%temp1, "); - string t10 = !strconcat(t9, a); - string t11 = !strconcat(t10, ";\n\t"); - string t12 = !strconcat(t11, cvt); - string t13 = !strconcat(t12, " \t%temp2, "); - string t14 = !strconcat(t13, b); - string t15 = !strconcat(t14, ";\n\t"); - string t16 = !strconcat(t15, OpcStr); - string t17 = !strconcat(t16, "16"); - string t18 = !strconcat(t17, " \tp, %temp1, %temp2;\n\t"); - string t19 = !strconcat(t18, "selp.s16 \t"); - string t20 = !strconcat(t19, d); - string s = !strconcat(t20, ", -1, 0, p;\n\t}}"); -} - multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode, string TypeStr, string CVTStr> { - def i8rr_toi8: NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - Set_Stri8<OpcStr, "$dst", "$a", "$b", TypeStr, CVTStr>.s, - []>; def i16rr_toi16: NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), Set_Str<OpcStr, "16", "16", "$dst", "$a", "$b">.s, @@ -1385,15 +1091,6 @@ multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode, Set_Str<OpcStr, "64", "64", "$dst", "$a", "$b">.s, []>; - def i8rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - Handle_i8rr<OpcStr, TypeStr, CVTStr>.s, - [(set Int1Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; - def i8ri_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - Handle_i8ri<OpcStr, TypeStr, CVTStr>.s, - [(set Int1Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; - def i8ir_p: NVPTXInst<(outs Int1Regs:$dst), (ins i8imm:$a, Int8Regs:$b), - Handle_i8ir<OpcStr, TypeStr, CVTStr>.s, - [(set Int1Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; def i16rr_p: NVPTXInst<(outs Int1Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr, "16 \t$dst, $a, $b;"), [(set Int1Regs:$dst, (OpNode Int16Regs:$a, Int16Regs:$b))]>; @@ -1422,15 +1119,6 @@ multiclass ISET_FORMAT<string OpcStr, string OpcStr_u32, PatFrag OpNode, !strconcat(OpcStr, "64 \t$dst, $a, $b;"), [(set Int1Regs:$dst, (OpNode imm:$a, Int64Regs:$b))]>; - def i8rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, Int8Regs:$b), - Handle_i8rr<OpcStr_u32, TypeStr, CVTStr>.s, - [(set Int32Regs:$dst, (OpNode Int8Regs:$a, Int8Regs:$b))]>; - def i8ri_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int8Regs:$a, i8imm:$b), - Handle_i8ri<OpcStr_u32, TypeStr, CVTStr>.s, - [(set Int32Regs:$dst, (OpNode Int8Regs:$a, imm:$b))]>; - def i8ir_u32: NVPTXInst<(outs Int32Regs:$dst), (ins i8imm:$a, Int8Regs:$b), - Handle_i8ir<OpcStr_u32, TypeStr, CVTStr>.s, - [(set Int32Regs:$dst, (OpNode imm:$a, Int8Regs:$b))]>; def i16rr_u32: NVPTXInst<(outs Int32Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b), !strconcat(OpcStr_u32, "16 \t$dst, $a, $b;"), @@ -1639,22 +1327,6 @@ defm FSetNAN : FSET_FORMAT<"setp.nan.", "set.nan.u32.",setuo>; def SELECTi1rr : Pat<(i1 (select Int1Regs:$p, Int1Regs:$a, Int1Regs:$b)), (ORb1rr (ANDb1rr Int1Regs:$p, Int1Regs:$a), (ANDb1rr (NOT1 Int1Regs:$p), Int1Regs:$b))>; -def SELECTi8rr : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, Int8Regs:$b, Int1Regs:$p), - "selp.b16 \t$dst, $a, $b, $p;", - [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, Int8Regs:$b))]>; -def SELECTi8ri : NVPTXInst<(outs Int8Regs:$dst), - (ins Int8Regs:$a, i8imm:$b, Int1Regs:$p), - "selp.b16 \t$dst, $a, $b, $p;", - [(set Int8Regs:$dst, (select Int1Regs:$p, Int8Regs:$a, imm:$b))]>; -def SELECTi8ir : NVPTXInst<(outs Int8Regs:$dst), - (ins i8imm:$a, Int8Regs:$b, Int1Regs:$p), - "selp.b16 \t$dst, $a, $b, $p;", - [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, Int8Regs:$b))]>; -def SELECTi8ii : NVPTXInst<(outs Int8Regs:$dst), - (ins i8imm:$a, i8imm:$b, Int1Regs:$p), - "selp.b16 \t$dst, $a, $b, $p;", - [(set Int8Regs:$dst, (select Int1Regs:$p, imm:$a, imm:$b))]>; def SELECTi16rr : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$a, Int16Regs:$b, Int1Regs:$p), @@ -1838,7 +1510,7 @@ class LoadParamMemInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), !strconcat(!strconcat("ld.param", opstr), "\t$dst, [retval0+$b];"), - [(set regclass:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; + []>; class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs regclass:$dst), (ins i32imm:$b), @@ -1846,8 +1518,6 @@ class LoadParamRegInst<NVPTXRegClass regclass, string opstr> : "\t$dst, retval$b;"), [(set regclass:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; -// FIXME: A bug in tablegen currently prevents us from using multi-output -// patterns here, so we have to custom select these in C++. class LoadParamV2MemInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs regclass:$dst, regclass:$dst2), (ins i32imm:$b), !strconcat(!strconcat("ld.param.v2", opstr), @@ -1864,24 +1534,21 @@ class StoreParamInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), !strconcat(!strconcat("st.param", opstr), "\t[param$a+$b], $val;"), - [(StoreParam (i32 imm:$a), (i32 imm:$b), regclass:$val)]>; + []>; class StoreParamV2Inst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a, i32imm:$b), !strconcat(!strconcat("st.param.v2", opstr), "\t[param$a+$b], {{$val, $val2}};"), - [(StoreParamV2 (i32 imm:$a), (i32 imm:$b), regclass:$val, - regclass:$val2)]>; + []>; class StoreParamV4Inst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, regclass:$val1, regclass:$val2, regclass:$val3, i32imm:$a, i32imm:$b), !strconcat(!strconcat("st.param.v4", opstr), "\t[param$a+$b], {{$val, $val2, $val3, $val4}};"), - [(StoreParamV4 (i32 imm:$a), (i32 imm:$b), regclass:$val, - regclass:$val2, regclass:$val3, - regclass:$val4)]>; + []>; class MoveToParamInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a, i32imm:$b), @@ -1893,13 +1560,13 @@ class StoreRetvalInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, i32imm:$a), !strconcat(!strconcat("st.param", opstr), "\t[func_retval0+$a], $val;"), - [(StoreRetval (i32 imm:$a), regclass:$val)]>; + []>; class StoreRetvalV2Inst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins regclass:$val, regclass:$val2, i32imm:$a), !strconcat(!strconcat("st.param.v2", opstr), "\t[func_retval0+$a], {{$val, $val2}};"), - [(StoreRetvalV2 (i32 imm:$a), regclass:$val, regclass:$val2)]>; + []>; class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), @@ -1907,8 +1574,7 @@ class StoreRetvalV4Inst<NVPTXRegClass regclass, string opstr> : regclass:$val4, i32imm:$a), !strconcat(!strconcat("st.param.v4", opstr), "\t[func_retval0+$a], {{$val, $val2, $val3, $val4}};"), - [(StoreRetvalV4 (i32 imm:$a), regclass:$val, regclass:$val2, - regclass:$val3, regclass:$val4)]>; + []>; class MoveToRetvalInst<NVPTXRegClass regclass, string opstr> : NVPTXInst<(outs), (ins i32imm:$num, regclass:$val), @@ -1983,29 +1649,19 @@ def PrintCallUniNoRetInst : NVPTXInst<(outs), (ins), "call.uni ", def LoadParamMemI64 : LoadParamMemInst<Int64Regs, ".b64">; def LoadParamMemI32 : LoadParamMemInst<Int32Regs, ".b32">; def LoadParamMemI16 : LoadParamMemInst<Int16Regs, ".b16">; -def LoadParamMemI8 : LoadParamMemInst<Int8Regs, ".b8">; -def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; -def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; -def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; -def LoadParamMemV2I8 : LoadParamV2MemInst<Int8Regs, ".b8">; -def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; -def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; -def LoadParamMemV4I8 : LoadParamV4MemInst<Int8Regs, ".b8">; - -//def LoadParamMemI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), -// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", -// "cvt.u16.u32\t$dst, temp_param_reg;"), -// [(set Int16Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; -//def LoadParamMemI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), -// !strconcat("ld.param.b32\ttemp_param_reg, [retval0+$b];\n\t", -// "cvt.u16.u32\t$dst, temp_param_reg;"), -// [(set Int8Regs:$dst, (LoadParam (i32 1), (i32 imm:$b)))]>; - +def LoadParamMemI8 : LoadParamMemInst<Int16Regs, ".b8">; +def LoadParamMemV2I64 : LoadParamV2MemInst<Int64Regs, ".b64">; +def LoadParamMemV2I32 : LoadParamV2MemInst<Int32Regs, ".b32">; +def LoadParamMemV2I16 : LoadParamV2MemInst<Int16Regs, ".b16">; +def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">; +def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">; +def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">; +def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">; def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">; def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">; -def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; -def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; -def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; +def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">; +def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">; +def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">; def LoadParamRegI64 : LoadParamRegInst<Int64Regs, ".b64">; def LoadParamRegI32 : LoadParamRegInst<Int32Regs, ".b32">; @@ -2013,10 +1669,6 @@ def LoadParamRegI16 : NVPTXInst<(outs Int16Regs:$dst), (ins i32imm:$b), "cvt.u16.u32\t$dst, retval$b;", [(set Int16Regs:$dst, (LoadParam (i32 0), (i32 imm:$b)))]>; -def LoadParamRegI8 : NVPTXInst<(outs Int8Regs:$dst), (ins i32imm:$b), - "cvt.u16.u32\t$dst, retval$b;", - [(set Int8Regs:$dst, - (LoadParam (i32 0), (i32 imm:$b)))]>; def LoadParamRegF32 : LoadParamRegInst<Float32Regs, ".f32">; def LoadParamRegF64 : LoadParamRegInst<Float64Regs, ".f64">; @@ -2024,31 +1676,12 @@ def LoadParamRegF64 : LoadParamRegInst<Float64Regs, ".f64">; def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">; def StoreParamI32 : StoreParamInst<Int32Regs, ".b32">; -def StoreParamI16 : NVPTXInst<(outs), - (ins Int16Regs:$val, i32imm:$a, i32imm:$b), - "st.param.b16\t[param$a+$b], $val;", - [(StoreParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; - -def StoreParamI8 : NVPTXInst<(outs), - (ins Int8Regs:$val, i32imm:$a, i32imm:$b), - "st.param.b8\t[param$a+$b], $val;", - [(StoreParam - (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; - -def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; -def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; - -def StoreParamV2I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, - i32imm:$a, i32imm:$b), - "st.param.v2.b16\t[param$a+$b], {{$val, $val2}};", - [(StoreParamV2 (i32 imm:$a), (i32 imm:$b), - Int16Regs:$val, Int16Regs:$val2)]>; - -def StoreParamV2I8 : NVPTXInst<(outs), (ins Int8Regs:$val, Int8Regs:$val2, - i32imm:$a, i32imm:$b), - "st.param.v2.b8\t[param$a+$b], {{$val, $val2}};", - [(StoreParamV2 (i32 imm:$a), (i32 imm:$b), - Int8Regs:$val, Int8Regs:$val2)]>; +def StoreParamI16 : StoreParamInst<Int16Regs, ".b16">; +def StoreParamI8 : StoreParamInst<Int16Regs, ".b8">; +def StoreParamV2I64 : StoreParamV2Inst<Int64Regs, ".b64">; +def StoreParamV2I32 : StoreParamV2Inst<Int32Regs, ".b32">; +def StoreParamV2I16 : StoreParamV2Inst<Int16Regs, ".b16">; +def StoreParamV2I8 : StoreParamV2Inst<Int16Regs, ".b8">; // FIXME: StoreParamV4Inst crashes llvm-tblgen :( //def StoreParamV4I32 : StoreParamV4Inst<Int32Regs, ".b32">; @@ -2056,47 +1689,41 @@ def StoreParamV4I32 : NVPTXInst<(outs), (ins Int32Regs:$val, Int32Regs:$val2, Int32Regs:$val3, Int32Regs:$val4, i32imm:$a, i32imm:$b), "st.param.b32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - [(StoreParamV4 (i32 imm:$a), (i32 imm:$b), - Int32Regs:$val, Int32Regs:$val2, - Int32Regs:$val3, Int32Regs:$val4)]>; + []>; def StoreParamV4I16 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, Int16Regs:$val3, Int16Regs:$val4, i32imm:$a, i32imm:$b), "st.param.v4.b16\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - [(StoreParamV4 (i32 imm:$a), (i32 imm:$b), - Int16Regs:$val, Int16Regs:$val2, - Int16Regs:$val3, Int16Regs:$val4)]>; + []>; -def StoreParamV4I8 : NVPTXInst<(outs), (ins Int8Regs:$val, Int8Regs:$val2, - Int8Regs:$val3, Int8Regs:$val4, +def StoreParamV4I8 : NVPTXInst<(outs), (ins Int16Regs:$val, Int16Regs:$val2, + Int16Regs:$val3, Int16Regs:$val4, i32imm:$a, i32imm:$b), "st.param.v4.b8\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - [(StoreParamV4 (i32 imm:$a), (i32 imm:$b), - Int8Regs:$val, Int8Regs:$val2, - Int8Regs:$val3, Int8Regs:$val4)]>; + []>; def StoreParamS32I16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a, i32imm:$b), !strconcat("cvt.s32.s16\ttemp_param_reg, $val;\n\t", "st.param.b32\t[param$a+$b], temp_param_reg;"), - [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + []>; def StoreParamU32I16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a, i32imm:$b), !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", "st.param.b32\t[param$a+$b], temp_param_reg;"), - [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; + []>; def StoreParamU32I8 : NVPTXInst<(outs), - (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), !strconcat("cvt.u32.u8\ttemp_param_reg, $val;\n\t", "st.param.b32\t[param$a+$b], temp_param_reg;"), - [(StoreParamU32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + []>; def StoreParamS32I8 : NVPTXInst<(outs), - (ins Int8Regs:$val, i32imm:$a, i32imm:$b), + (ins Int16Regs:$val, i32imm:$a, i32imm:$b), !strconcat("cvt.s32.s8\ttemp_param_reg, $val;\n\t", "st.param.b32\t[param$a+$b], temp_param_reg;"), - [(StoreParamS32 (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; + []>; def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">; def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">; @@ -2109,9 +1736,7 @@ def StoreParamV4F32 : NVPTXInst<(outs), Float32Regs:$val3, Float32Regs:$val4, i32imm:$a, i32imm:$b), "st.param.v4.f32\t[param$a+$b], {{$val, $val2, $val3, $val4}};", - [(StoreParamV4 (i32 imm:$a), (i32 imm:$b), - Float32Regs:$val, Float32Regs:$val2, - Float32Regs:$val3, Float32Regs:$val4)]>; + []>; def MoveToParamI64 : MoveToParamInst<Int64Regs, ".b64">; def MoveToParamI32 : MoveToParamInst<Int32Regs, ".b32">; @@ -2122,36 +1747,18 @@ def MoveToParamI16 : NVPTXInst<(outs), !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", "mov.b32\tparam$a, temp_param_reg;"), [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int16Regs:$val)]>; -def MoveToParamI8 : NVPTXInst<(outs), - (ins Int8Regs:$val, i32imm:$a, i32imm:$b), - !strconcat("cvt.u32.u16\ttemp_param_reg, $val;\n\t", - "mov.b32\tparam$a, temp_param_reg;"), - [(MoveToParam (i32 imm:$a), (i32 imm:$b), Int8Regs:$val)]>; def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">; def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">; def StoreRetvalI16 : StoreRetvalInst<Int16Regs, ".b16">; -def StoreRetvalI8 : StoreRetvalInst<Int8Regs, ".b8">; +def StoreRetvalI8 : StoreRetvalInst<Int16Regs, ".b8">; def StoreRetvalV2I64 : StoreRetvalV2Inst<Int64Regs, ".b64">; def StoreRetvalV2I32 : StoreRetvalV2Inst<Int32Regs, ".b32">; def StoreRetvalV2I16 : StoreRetvalV2Inst<Int16Regs, ".b16">; -def StoreRetvalV2I8 : StoreRetvalV2Inst<Int8Regs, ".b8">; +def StoreRetvalV2I8 : StoreRetvalV2Inst<Int16Regs, ".b8">; def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">; def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">; -def StoreRetvalV4I8 : StoreRetvalV4Inst<Int8Regs, ".b8">; - -//def StoreRetvalI16 : NVPTXInst<(outs), (ins Int16Regs:$val, i32imm:$a), -// !strconcat("\{\n\t", -// !strconcat(".reg .b32 temp_retval_reg;\n\t", -// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", -// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), -// [(StoreRetval (i32 imm:$a), Int16Regs:$val)]>; -//def StoreRetvalI8 : NVPTXInst<(outs), (ins Int8Regs:$val, i32imm:$a), -// !strconcat("\{\n\t", -// !strconcat(".reg .b32 temp_retval_reg;\n\t", -// !strconcat("cvt.u32.u16\ttemp_retval_reg, $val;\n\t", -// "st.param.b32\t[func_retval0+$a], temp_retval_reg;\n\t\}"))), -// [(StoreRetval (i32 imm:$a), Int8Regs:$val)]>; +def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">; def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">; def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">; @@ -2162,7 +1769,7 @@ def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">; def MoveRetvalI64 : MoveRetvalInst<Int64Regs, ".b64">; def MoveRetvalI32 : MoveRetvalInst<Int32Regs, ".b32">; def MoveRetvalI16 : MoveRetvalInst<Int16Regs, ".b16">; -def MoveRetvalI8 : MoveRetvalInst<Int8Regs, ".b8">; +def MoveRetvalI8 : MoveRetvalInst<Int16Regs, ".b8">; def MoveRetvalF64 : MoveRetvalInst<Float64Regs, ".f64">; def MoveRetvalF32 : MoveRetvalInst<Float32Regs, ".f32">; @@ -2173,9 +1780,6 @@ def MoveToRetvalF32 : MoveToRetvalInst<Float32Regs, ".f32">; def MoveToRetvalI16 : NVPTXInst<(outs), (ins i32imm:$num, Int16Regs:$val), "cvt.u32.u16\tfunc_retval$num, $val;", [(MoveToRetval (i32 imm:$num), Int16Regs:$val)]>; -def MoveToRetvalI8 : NVPTXInst<(outs), (ins i32imm:$num, Int8Regs:$val), - "cvt.u32.u16\tfunc_retval$num, $val;", - [(MoveToRetval (i32 imm:$num), Int8Regs:$val)]>; def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; @@ -2193,7 +1797,6 @@ class LastCallArgInst<NVPTXRegClass regclass> : def CallArgI64 : CallArgInst<Int64Regs>; def CallArgI32 : CallArgInst<Int32Regs>; def CallArgI16 : CallArgInst<Int16Regs>; -def CallArgI8 : CallArgInst<Int8Regs>; def CallArgF64 : CallArgInst<Float64Regs>; def CallArgF32 : CallArgInst<Float32Regs>; @@ -2201,7 +1804,6 @@ def CallArgF32 : CallArgInst<Float32Regs>; def LastCallArgI64 : LastCallArgInst<Int64Regs>; def LastCallArgI32 : LastCallArgInst<Int32Regs>; def LastCallArgI16 : LastCallArgInst<Int16Regs>; -def LastCallArgI8 : LastCallArgInst<Int8Regs>; def LastCallArgF64 : LastCallArgInst<Float64Regs>; def LastCallArgF32 : LastCallArgInst<Float32Regs>; @@ -2261,9 +1863,6 @@ def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">; def MoveParamI16 : NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), "cvt.u16.u32\t$dst, $src;", [(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>; -def MoveParamI8 : NVPTXInst<(outs Int8Regs:$dst), (ins Int8Regs:$src), - "cvt.u16.u32\t$dst, $src;", - [(set Int8Regs:$dst, (MoveParam Int8Regs:$src))]>; def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">; def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">; @@ -2275,7 +1874,6 @@ class PseudoUseParamInst<NVPTXRegClass regclass> : def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>; def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>; def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>; -def PseudoUseParamI8 : PseudoUseParamInst<Int8Regs>; def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>; def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>; @@ -2317,7 +1915,7 @@ multiclass LD<NVPTXRegClass regclass> { } let mayLoad=1, neverHasSideEffects=1 in { -defm LD_i8 : LD<Int8Regs>; +defm LD_i8 : LD<Int16Regs>; defm LD_i16 : LD<Int16Regs>; defm LD_i32 : LD<Int32Regs>; defm LD_i64 : LD<Int64Regs>; @@ -2359,7 +1957,7 @@ multiclass ST<NVPTXRegClass regclass> { } let mayStore=1, neverHasSideEffects=1 in { -defm ST_i8 : ST<Int8Regs>; +defm ST_i8 : ST<Int16Regs>; defm ST_i16 : ST<Int16Regs>; defm ST_i32 : ST<Int32Regs>; defm ST_i64 : ST<Int64Regs>; @@ -2443,7 +2041,7 @@ multiclass LD_VEC<NVPTXRegClass regclass> { []>; } let mayLoad=1, neverHasSideEffects=1 in { -defm LDV_i8 : LD_VEC<Int8Regs>; +defm LDV_i8 : LD_VEC<Int16Regs>; defm LDV_i16 : LD_VEC<Int16Regs>; defm LDV_i32 : LD_VEC<Int32Regs>; defm LDV_i64 : LD_VEC<Int64Regs>; @@ -2526,7 +2124,7 @@ multiclass ST_VEC<NVPTXRegClass regclass> { []>; } let mayStore=1, neverHasSideEffects=1 in { -defm STV_i8 : ST_VEC<Int8Regs>; +defm STV_i8 : ST_VEC<Int16Regs>; defm STV_i16 : ST_VEC<Int16Regs>; defm STV_i32 : ST_VEC<Int32Regs>; defm STV_i64 : ST_VEC<Int64Regs>; @@ -2539,10 +2137,6 @@ defm STV_f64 : ST_VEC<Float64Regs>; multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> { // FIXME: need to add f16 support -// def CVTf16i8 : -// NVPTXInst<(outs Float16Regs:$d), (ins Int8Regs:$a), -// !strconcat(!strconcat("cvt.rn.f16.", OpStr), "8 \t$d, $a;"), -// [(set Float16Regs:$d, (OpNode Int8Regs:$a))]>; // def CVTf16i16 : // NVPTXInst<(outs Float16Regs:$d), (ins Int16Regs:$a), // !strconcat(!strconcat("cvt.rn.f16.", OpStr), "16 \t$d, $a;"), @@ -2560,10 +2154,6 @@ multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> { NVPTXInst<(outs Float32Regs:$d), (ins Int1Regs:$a), "selp.f32 \t$d, 1.0, 0.0, $a;", [(set Float32Regs:$d, (OpNode Int1Regs:$a))]>; - def CVTf32i8 : - NVPTXInst<(outs Float32Regs:$d), (ins Int8Regs:$a), - !strconcat(!strconcat("cvt.rn.f32.", OpStr), "8 \t$d, $a;"), - [(set Float32Regs:$d, (OpNode Int8Regs:$a))]>; def CVTf32i16 : NVPTXInst<(outs Float32Regs:$d), (ins Int16Regs:$a), !strconcat(!strconcat("cvt.rn.f32.", OpStr), "16 \t$d, $a;"), @@ -2581,10 +2171,6 @@ multiclass CVT_INT_TO_FP <string OpStr, SDNode OpNode> { NVPTXInst<(outs Float64Regs:$d), (ins Int1Regs:$a), "selp.f64 \t$d, 1.0, 0.0, $a;", [(set Float64Regs:$d, (OpNode Int1Regs:$a))]>; - def CVTf64i8 : - NVPTXInst<(outs Float64Regs:$d), (ins Int8Regs:$a), - !strconcat(!strconcat("cvt.rn.f64.", OpStr), "8 \t$d, $a;"), - [(set Float64Regs:$d, (OpNode Int8Regs:$a))]>; def CVTf64i16 : NVPTXInst<(outs Float64Regs:$d), (ins Int16Regs:$a), !strconcat(!strconcat("cvt.rn.f64.", OpStr), "16 \t$d, $a;"), @@ -2604,24 +2190,6 @@ defm Uint_to_fp : CVT_INT_TO_FP <"u", uint_to_fp>; multiclass CVT_FP_TO_INT <string OpStr, SDNode OpNode> { // FIXME: need to add f16 support -// def CVTi8f16: -// NVPTXInst<(outs Int8Regs:$d), (ins Float16Regs:$a), -// !strconcat(!strconcat("cvt.rzi.", OpStr), "8.f16 $d, $a;"), -// [(set Int8Regs:$d, (OpNode Float16Regs:$a))]>; - def CVTi8f32_ftz: - NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), - !strconcat(!strconcat("cvt.rzi.ftz.", OpStr), "16.f32 \t$d, $a;"), - [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>, Requires<[doF32FTZ]>; - def CVTi8f32: - NVPTXInst<(outs Int8Regs:$d), (ins Float32Regs:$a), - !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f32 \t$d, $a;"), - [(set Int8Regs:$d, (OpNode Float32Regs:$a))]>; - def CVTi8f64: - NVPTXInst<(outs Int8Regs:$d), (ins Float64Regs:$a), - !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f64 \t$d, $a;"), - [(set Int8Regs:$d, (OpNode Float64Regs:$a))]>; - -// FIXME: need to add f16 support // def CVTi16f16: // NVPTXInst<(outs Int16Regs:$d), (ins Float16Regs:$a), // !strconcat(!strconcat("cvt.rzi.", OpStr), "16.f16 \t$d, $a;"), @@ -2680,10 +2248,6 @@ defm Fp_to_sint : CVT_FP_TO_INT <"s", fp_to_sint>; defm Fp_to_uint : CVT_FP_TO_INT <"u", fp_to_uint>; multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> { - def ext1to8: - NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a), - "selp.u16 \t$d, 1, 0, $a;", - [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>; def ext1to16: NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a), "selp.u16 \t$d, 1, 0, $a;", @@ -2699,10 +2263,6 @@ multiclass INT_EXTEND_UNSIGNED_1 <SDNode OpNode> { } multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> { - def ext1to8: - NVPTXInst<(outs Int8Regs:$d), (ins Int1Regs:$a), - "selp.s16 \t$d, -1, 0, $a;", - [(set Int8Regs:$d, (OpNode Int1Regs:$a))]>; def ext1to16: NVPTXInst<(outs Int16Regs:$d), (ins Int1Regs:$a), "selp.s16 \t$d, -1, 0, $a;", @@ -2718,23 +2278,6 @@ multiclass INT_EXTEND_SIGNED_1 <SDNode OpNode> { } multiclass INT_EXTEND <string OpStr, SDNode OpNode> { - // All Int8Regs are emiited as 16bit registers in ptx. - // And there is no selp.u8 in ptx. - def ext8to16: - NVPTXInst<(outs Int16Regs:$d), (ins Int8Regs:$a), - !strconcat("cvt.", !strconcat(OpStr, !strconcat("16.", - !strconcat(OpStr, "8 \t$d, $a;")))), - [(set Int16Regs:$d, (OpNode Int8Regs:$a))]>; - def ext8to32: - NVPTXInst<(outs Int32Regs:$d), (ins Int8Regs:$a), - !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.", - !strconcat(OpStr, "8 \t$d, $a;")))), - [(set Int32Regs:$d, (OpNode Int8Regs:$a))]>; - def ext8to64: - NVPTXInst<(outs Int64Regs:$d), (ins Int8Regs:$a), - !strconcat("cvt.", !strconcat(OpStr, !strconcat("64.", - !strconcat(OpStr, "8 \t$d, $a;")))), - [(set Int64Regs:$d, (OpNode Int8Regs:$a))]>; def ext16to32: NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$a), !strconcat("cvt.", !strconcat(OpStr, !strconcat("32.", @@ -2778,18 +2321,9 @@ def TRUNC_64to32 : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), def TRUNC_64to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int64Regs:$a), "cvt.u16.u64 \t$d, $a;", [(set Int16Regs:$d, (trunc Int64Regs:$a))]>; -def TRUNC_64to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int64Regs:$a), - "cvt.u8.u64 \t$d, $a;", - [(set Int8Regs:$d, (trunc Int64Regs:$a))]>; def TRUNC_32to16 : NVPTXInst<(outs Int16Regs:$d), (ins Int32Regs:$a), "cvt.u16.u32 \t$d, $a;", [(set Int16Regs:$d, (trunc Int32Regs:$a))]>; -def TRUNC_32to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int32Regs:$a), - "cvt.u8.u32 \t$d, $a;", - [(set Int8Regs:$d, (trunc Int32Regs:$a))]>; -def TRUNC_16to8 : NVPTXInst<(outs Int8Regs:$d), (ins Int16Regs:$a), - "cvt.u8.u16 \t$d, $a;", - [(set Int8Regs:$d, (trunc Int16Regs:$a))]>; def TRUNC_64to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), TRUNC_to1_asm<".b64">.s, [(set Int1Regs:$d, (trunc Int64Regs:$a))]>; @@ -2799,13 +2333,8 @@ def TRUNC_32to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int32Regs:$a), def TRUNC_16to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int16Regs:$a), TRUNC_to1_asm<".b16">.s, [(set Int1Regs:$d, (trunc Int16Regs:$a))]>; -def TRUNC_8to1 : NVPTXInst<(outs Int1Regs:$d), (ins Int8Regs:$a), - TRUNC_to1_asm<".b16">.s, - [(set Int1Regs:$d, (trunc Int8Regs:$a))]>; // Select instructions -def : Pat<(select Int32Regs:$pred, Int8Regs:$a, Int8Regs:$b), - (SELECTi8rr Int8Regs:$a, Int8Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>; def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b), (SELECTi16rr Int16Regs:$a, Int16Regs:$b, (TRUNC_32to1 Int32Regs:$pred))>; @@ -2834,28 +2363,11 @@ def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>; def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>; // pack a set of smaller int registers to a larger int register -def V4I8toI32 : NVPTXInst<(outs Int32Regs:$d), - (ins Int8Regs:$s1, Int8Regs:$s2, - Int8Regs:$s3, Int8Regs:$s4), - !strconcat("{{\n\t.reg .b8\t%t<4>;", - !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", - !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", - !strconcat("\n\tcvt.u8.u8\t%t2, $s3;", - !strconcat("\n\tcvt.u8.u8\t%t3, $s4;", - "\n\tmov.b32\t$d, {%t0, %t1, %t2, %t3};\n\t}}"))))), - []>; def V4I16toI64 : NVPTXInst<(outs Int64Regs:$d), (ins Int16Regs:$s1, Int16Regs:$s2, Int16Regs:$s3, Int16Regs:$s4), "mov.b64\t$d, {{$s1, $s2, $s3, $s4}};", []>; -def V2I8toI16 : NVPTXInst<(outs Int16Regs:$d), - (ins Int8Regs:$s1, Int8Regs:$s2), - !strconcat("{{\n\t.reg .b8\t%t<2>;", - !strconcat("\n\tcvt.u8.u8\t%t0, $s1;", - !strconcat("\n\tcvt.u8.u8\t%t1, $s2;", - "\n\tmov.b16\t$d, {%t0, %t1};\n\t}}"))), - []>; def V2I16toI32 : NVPTXInst<(outs Int32Regs:$d), (ins Int16Regs:$s1, Int16Regs:$s2), "mov.b32\t$d, {{$s1, $s2}};", @@ -2870,28 +2382,11 @@ def V2F32toF64 : NVPTXInst<(outs Float64Regs:$d), []>; // unpack a larger int register to a set of smaller int registers -def I32toV4I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2, - Int8Regs:$d3, Int8Regs:$d4), - (ins Int32Regs:$s), - !strconcat("{{\n\t.reg .b8\t%t<4>;", - !strconcat("\n\tmov.b32\t{%t0, %t1, %t2, %t3}, $s;", - !strconcat("\n\tcvt.u8.u8\t$d1, %t0;", - !strconcat("\n\tcvt.u8.u8\t$d2, %t1;", - !strconcat("\n\tcvt.u8.u8\t$d3, %t2;", - "\n\tcvt.u8.u8\t$d4, %t3;\n\t}}"))))), - []>; def I64toV4I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2, Int16Regs:$d3, Int16Regs:$d4), (ins Int64Regs:$s), "mov.b64\t{{$d1, $d2, $d3, $d4}}, $s;", []>; -def I16toV2I8 : NVPTXInst<(outs Int8Regs:$d1, Int8Regs:$d2), - (ins Int16Regs:$s), - !strconcat("{{\n\t.reg .b8\t%t<2>;", - !strconcat("\n\tmov.b16\t{%t0, %t1}, $s;", - !strconcat("\n\tcvt.u8.u8\t$d1, %t0;", - "\n\tcvt.u8.u8\t$d2, %t1;\n\t}}"))), - []>; def I32toV2I16 : NVPTXInst<(outs Int16Regs:$d1, Int16Regs:$d2), (ins Int32Regs:$s), "mov.b32\t{{$d1, $d2}}, $s;", diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 24037cafef..caa7775a4c 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -1270,6 +1270,11 @@ def INT_PTX_SREG_WARPSIZE : F_SREG<"mov.u32 \t$dst, WARP_SZ;", Int32Regs, // Support for ldu on sm_20 or later //----------------------------------- +def ldu_i8 : PatFrag<(ops node:$ptr), (int_nvvm_ldu_global_i node:$ptr), [{ + MemIntrinsicSDNode *M = cast<MemIntrinsicSDNode>(N); + return M->getMemoryVT() == MVT::i8; +}]>; + // Scalar // @TODO: Revisit this, Changed imemAny to imem multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { @@ -1291,8 +1296,27 @@ multiclass LDU_G<string TyStr, NVPTXRegClass regclass, Intrinsic IntOp> { [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; } -defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int8Regs, -int_nvvm_ldu_global_i>; +multiclass LDU_G_NOINTRIN<string TyStr, NVPTXRegClass regclass, PatFrag IntOp> { + def areg: NVPTXInst<(outs regclass:$result), (ins Int32Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int32Regs:$src))]>, Requires<[hasLDU]>; + def areg64: NVPTXInst<(outs regclass:$result), (ins Int64Regs:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp Int64Regs:$src))]>, Requires<[hasLDU]>; + def avar: NVPTXInst<(outs regclass:$result), (ins imem:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp (Wrapper tglobaladdr:$src)))]>, + Requires<[hasLDU]>; + def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri:$src))]>, Requires<[hasLDU]>; + def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src), + !strconcat("ldu.global.", TyStr), + [(set regclass:$result, (IntOp ADDRri64:$src))]>, Requires<[hasLDU]>; +} + +defm INT_PTX_LDU_GLOBAL_i8 : LDU_G_NOINTRIN<"u8 \t$result, [$src];", Int16Regs, + ldu_i8>; defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs, int_nvvm_ldu_global_i>; defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs, @@ -1330,7 +1354,7 @@ multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> { } defm INT_PTX_LDU_G_v2i8_ELE - : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int8Regs>; + : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>; defm INT_PTX_LDU_G_v2i32_ELE @@ -1342,7 +1366,7 @@ defm INT_PTX_LDU_G_v2i64_ELE defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>; defm INT_PTX_LDU_G_v4i8_ELE - : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int8Regs>; + : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>; @@ -1542,10 +1566,6 @@ def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), // nvvm.move intrinsicc -def nvvm_move_i8 : NVPTXInst<(outs Int8Regs:$r), (ins Int8Regs:$s), - "mov.b16 \t$r, $s;", - [(set Int8Regs:$r, - (int_nvvm_move_i8 Int8Regs:$s))]>; def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), "mov.b16 \t$r, $s;", [(set Int16Regs:$r, diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp index b749b05315..4d3a1d9b40 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp @@ -38,10 +38,6 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) { return ".s32"; } else if (RC == &NVPTX::Int16RegsRegClass) { return ".s16"; - } - // Int8Regs become 16-bit registers in PTX - else if (RC == &NVPTX::Int8RegsRegClass) { - return ".s16"; } else if (RC == &NVPTX::Int1RegsRegClass) { return ".pred"; } else if (RC == &NVPTX::SpecialRegsRegClass) { @@ -64,8 +60,6 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) { return "%r"; } else if (RC == &NVPTX::Int16RegsRegClass) { return "%rs"; - } else if (RC == &NVPTX::Int8RegsRegClass) { - return "%rc"; } else if (RC == &NVPTX::Int1RegsRegClass) { return "%p"; } else if (RC == &NVPTX::SpecialRegsRegClass) { diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td index 8d100d6316..bc705b8d5f 100644 --- a/lib/Target/NVPTX/NVPTXRegisterInfo.td +++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td @@ -31,7 +31,6 @@ def VRDepot : NVPTXReg<"%Depot">; foreach i = 0-395 in { def P#i : NVPTXReg<"%p"#i>; // Predicate - def RC#i : NVPTXReg<"%rc"#i>; // 8-bit def RS#i : NVPTXReg<"%rs"#i>; // 16-bit def R#i : NVPTXReg<"%r"#i>; // 32-bit def RL#i : NVPTXReg<"%rl"#i>; // 64-bit @@ -49,7 +48,6 @@ foreach i = 0-395 in { // Register classes //===----------------------------------------------------------------------===// def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 395))>; -def Int8Regs : NVPTXRegClass<[i8], 8, (add (sequence "RC%u", 0, 395))>; def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 395))>; def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 395))>; def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 395))>; diff --git a/test/CodeGen/NVPTX/compare-int.ll b/test/CodeGen/NVPTX/compare-int.ll index 16af0a336d..e929f24ddb 100644 --- a/test/CodeGen/NVPTX/compare-int.ll +++ b/test/CodeGen/NVPTX/compare-int.ll @@ -288,8 +288,8 @@ define i16 @icmp_sle_i16(i16 %a, i16 %b) { define i8 @icmp_eq_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.eq.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp eq i8 %a, %b %ret = zext i1 %cmp to i8 @@ -298,8 +298,8 @@ define i8 @icmp_eq_i8(i8 %a, i8 %b) { define i8 @icmp_ne_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ne.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ne i8 %a, %b %ret = zext i1 %cmp to i8 @@ -308,8 +308,8 @@ define i8 @icmp_ne_i8(i8 %a, i8 %b) { define i8 @icmp_ugt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ugt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -318,8 +318,8 @@ define i8 @icmp_ugt_i8(i8 %a, i8 %b) { define i8 @icmp_uge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp uge i8 %a, %b %ret = zext i1 %cmp to i8 @@ -328,8 +328,8 @@ define i8 @icmp_uge_i8(i8 %a, i8 %b) { define i8 @icmp_ult_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ult i8 %a, %b %ret = zext i1 %cmp to i8 @@ -338,8 +338,8 @@ define i8 @icmp_ult_i8(i8 %a, i8 %b) { define i8 @icmp_ule_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.u16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp ule i8 %a, %b %ret = zext i1 %cmp to i8 @@ -348,8 +348,8 @@ define i8 @icmp_ule_i8(i8 %a, i8 %b) { define i8 @icmp_sgt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.gt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sgt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -358,8 +358,8 @@ define i8 @icmp_sgt_i8(i8 %a, i8 %b) { define i8 @icmp_sge_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.ge.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sge i8 %a, %b %ret = zext i1 %cmp to i8 @@ -368,8 +368,8 @@ define i8 @icmp_sge_i8(i8 %a, i8 %b) { define i8 @icmp_slt_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.lt.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp slt i8 %a, %b %ret = zext i1 %cmp to i8 @@ -378,8 +378,8 @@ define i8 @icmp_slt_i8(i8 %a, i8 %b) { define i8 @icmp_sle_i8(i8 %a, i8 %b) { ; Comparison happens in 16-bit -; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %temp{{[0-9]+}}, %temp{{[0-9]+}} -; CHECK: selp.u16 %rc{{[0-9]+}}, 1, 0, %p[[P0]] +; CHECK: setp.le.s16 %p[[P0:[0-9]+]], %rs{{[0-9]+}}, %rs{{[0-9]+}} +; CHECK: selp.u16 %rs{{[0-9]+}}, 1, 0, %p[[P0]] ; CHECK: ret %cmp = icmp sle i8 %a, %b %ret = zext i1 %cmp to i8 diff --git a/test/CodeGen/NVPTX/ld-addrspace.ll b/test/CodeGen/NVPTX/ld-addrspace.ll index 3265868d3c..204ae7b1fb 100644 --- a/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/test/CodeGen/NVPTX/ld-addrspace.ll @@ -4,27 +4,27 @@ ;; i8 define i8 @ld_global_i8(i8 addrspace(1)* %ptr) { -; PTX32: ld.global.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.global.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(1)* %ptr ret i8 %a } define i8 @ld_shared_i8(i8 addrspace(3)* %ptr) { -; PTX32: ld.shared.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.shared.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.shared.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.shared.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(3)* %ptr ret i8 %a } define i8 @ld_local_i8(i8 addrspace(5)* %ptr) { -; PTX32: ld.local.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.local.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.local.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.local.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(5)* %ptr ret i8 %a diff --git a/test/CodeGen/NVPTX/ld-generic.ll b/test/CodeGen/NVPTX/ld-generic.ll index 81a5216f96..f811a37191 100644 --- a/test/CodeGen/NVPTX/ld-generic.ll +++ b/test/CodeGen/NVPTX/ld-generic.ll @@ -4,9 +4,9 @@ ;; i8 define i8 @ld_global_i8(i8 addrspace(0)* %ptr) { -; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] ; PTX64: ret %a = load i8 addrspace(0)* %ptr ret i8 %a diff --git a/test/CodeGen/NVPTX/pr13291-i1-store.ll b/test/CodeGen/NVPTX/pr13291-i1-store.ll index 779f7798d8..a5526f8ad7 100644 --- a/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -2,21 +2,21 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64 define ptx_kernel void @t1(i1* %a) { -; PTX32: mov.u16 %rc{{[0-9]+}}, 0; -; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}}; -; PTX64: mov.u16 %rc{{[0-9]+}}, 0; -; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}}; +; PTX32: mov.u16 %rs{{[0-9]+}}, 0; +; PTX32-NEXT: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64: mov.u16 %rs{{[0-9]+}}, 0; +; PTX64-NEXT: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, i1* %a ret void } define ptx_kernel void @t2(i1* %a, i8* %b) { -; PTX32: ld.u8 %rc{{[0-9]+}}, [%r{{[0-9]+}}] -; PTX32: and.b16 temp, %rc{{[0-9]+}}, 1; +; PTX32: ld.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: and.b16 temp, %rs{{[0-9]+}}, 1; ; PTX32: setp.b16.eq %p{{[0-9]+}}, temp, 1; -; PTX64: ld.u8 %rc{{[0-9]+}}, [%rl{{[0-9]+}}] -; PTX64: and.b16 temp, %rc{{[0-9]+}}, 1; +; PTX64: ld.u8 %rs{{[0-9]+}}, [%rl{{[0-9]+}}] +; PTX64: and.b16 temp, %rs{{[0-9]+}}, 1; ; PTX64: setp.b16.eq %p{{[0-9]+}}, temp, 1; %t1 = load i1* %a diff --git a/test/CodeGen/NVPTX/st-addrspace.ll b/test/CodeGen/NVPTX/st-addrspace.ll index 0b26d802df..68c09fe065 100644 --- a/test/CodeGen/NVPTX/st-addrspace.ll +++ b/test/CodeGen/NVPTX/st-addrspace.ll @@ -5,27 +5,27 @@ ;; i8 define void @st_global_i8(i8 addrspace(1)* %ptr, i8 %a) { -; PTX32: st.global.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.global.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(1)* %ptr ret void } define void @st_shared_i8(i8 addrspace(3)* %ptr, i8 %a) { -; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.shared.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(3)* %ptr ret void } define void @st_local_i8(i8 addrspace(5)* %ptr, i8 %a) { -; PTX32: st.local.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.local.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(5)* %ptr ret void diff --git a/test/CodeGen/NVPTX/st-generic.ll b/test/CodeGen/NVPTX/st-generic.ll index 59a1fe0211..b9c616fbd1 100644 --- a/test/CodeGen/NVPTX/st-generic.ll +++ b/test/CodeGen/NVPTX/st-generic.ll @@ -5,9 +5,9 @@ ;; i8 define void @st_global_i8(i8 addrspace(0)* %ptr, i8 %a) { -; PTX32: st.u8 [%r{{[0-9]+}}], %rc{{[0-9]+}} +; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u8 [%rl{{[0-9]+}}], %rc{{[0-9]+}} +; PTX64: st.u8 [%rl{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, i8 addrspace(0)* %ptr ret void |