author | Tom Stellard <thomas.stellard@amd.com> | 2014-06-17 16:53:14 +0000
committer | Tom Stellard <thomas.stellard@amd.com> | 2014-06-17 16:53:14 +0000
commit | f56e7678d1ced97d5513e0a75658dc48396e4a58 (patch)
tree | 72898cf24c9d9becee91dec7b74d37c2bbfc9242
parent | ff8dc48da387719d4b4c4712715be0e2d2672d87 (diff)
R600: Use LDS and vectors for private memory
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211110 91177308-0d34-0410-b5e6-96231b3b80d8
25 files changed, 788 insertions, 73 deletions
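At the IR level the new pass works roughly like this (a sketch on a hypothetical kernel; the names, the [5 x i32] type, and the use of only tidig.x for the thread id are illustrative, not output of the pass):

    ; before: dynamically indexed private array
    %stack = alloca [5 x i32], align 4
    %gep = getelementptr [5 x i32]* %stack, i32 0, i32 %idx
    store i32 4, i32* %gep

    ; after: one LDS array per work-group (256 = assumed maximum
    ; work-group size), sliced per work-item by a thread id computed
    ; from llvm.r600.read.tidig.* and llvm.r600.read.local.size.*
    @stack = external addrspace(3) global [256 x [5 x i32]]
    %tid = call i32 @llvm.r600.read.tidig.x()
    %slice = getelementptr [256 x [5 x i32]] addrspace(3)* @stack, i32 0, i32 %tid
    %gep = getelementptr [5 x i32] addrspace(3)* %slice, i32 0, i32 %idx
    store i32 4, i32 addrspace(3)* %gep

Allocas whose GEPs resolve to constant or simple dynamic vector indices are instead rewritten into vector values kept in registers; on the VLIW R600 generations these live in the new "vertical" register classes, hence the BUILD_VERTICAL_VECTOR node and R600_Reg*Vertical definitions in the diff below.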
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index 3fd8e2f924..f92bde8537 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -17,6 +17,7 @@ namespace llvm { class AMDGPUInstrPrinter; +class AMDGPUSubtarget; class AMDGPUTargetMachine; class FunctionPass; class MCAsmInfo; @@ -47,6 +48,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; // Passes common to R600 and SI +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST); Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td index 2cfc463937..d3dff531a7 100644 --- a/lib/Target/R600/AMDGPU.td +++ b/lib/Target/R600/AMDGPU.td @@ -86,28 +86,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>; def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>; def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>; +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature< + "localmemorysize"#Value, + "LocalMemorySize", + !cast<string>(Value), + "The size of local memory in bytes">; + class SubtargetFeatureGeneration <string Value, list<SubtargetFeature> Implies> : SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value, Value#" GPU generation", Implies>; +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>; +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>; +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>; + def FeatureR600 : SubtargetFeatureGeneration<"R600", - [FeatureR600ALUInst, FeatureFetchLimit8]>; + [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>; def FeatureR700 : SubtargetFeatureGeneration<"R700", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize0]>; def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN", - [FeatureFetchLimit16]>; + [FeatureFetchLimit16, FeatureLocalMemorySize32768]>; def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS", - [FeatureFetchLimit16, FeatureWavefrontSize64]>; + [FeatureFetchLimit16, FeatureWavefrontSize64, + FeatureLocalMemorySize32768] +>; def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp index 8385baa101..b4e86ce3a1 100644 --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -258,6 +258,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args); } case ISD::SCALAR_TO_VECTOR: + case AMDGPUISD::BUILD_VERTICAL_VECTOR: case ISD::BUILD_VECTOR: { unsigned RegClassID; const AMDGPURegisterInfo *TRI = @@ -308,7 +309,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { // can't be bundled by our scheduler. 
switch(NumVectorElts) { case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break; - case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break; + case 4: + if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR) + RegClassID = AMDGPU::R600_Reg128VerticalRegClassID; + else + RegClassID = AMDGPU::R600_Reg128RegClassID; + break; default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR"); } } diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp index 1627ec0877..58959b8d4b 100644 --- a/lib/Target/R600/AMDGPUISelLowering.cpp +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -1911,6 +1911,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(CVT_F32_UBYTE1) NODE_NAME_CASE(CVT_F32_UBYTE2) NODE_NAME_CASE(CVT_F32_UBYTE3) + NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(STORE_MSKOR) NODE_NAME_CASE(TBUFFER_STORE_FORMAT) } diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 57f09bebd1..a4baaf1258 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -203,6 +203,15 @@ enum { CVT_F32_UBYTE1, CVT_F32_UBYTE2, CVT_F32_UBYTE3, + /// This node is for VLIW targets and it is used to represent a vector + /// that is stored in consecutive registers with the same channel. + /// For example: + /// |X |Y|Z|W| + /// T0|v.x| | | | + /// T1|v.y| | | | + /// T2|v.z| | | | + /// T3|v.w| | | | + BUILD_VERTICAL_VECTOR, FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE, STORE_MSKOR, LOAD_CONSTANT, diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp new file mode 100644 index 0000000000..2d3a7fdd4f --- /dev/null +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp @@ -0,0 +1,365 @@ +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass eliminates allocas by either converting them into vectors or +// by migrating them to local address space. 
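+// For example (an illustrative sketch, not text from this patch): a
+// dynamically indexed "alloca [4 x i32]" with simple GEPs can become a
+// <4 x i32> manipulated via insertelement/extractelement, while an
+// alloca the pass cannot vectorize instead receives a per-work-item
+// slice of an LDS array.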
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "amdgpu-promote-alloca" + +using namespace llvm; + +namespace { + +class AMDGPUPromoteAlloca : public FunctionPass, + public InstVisitor<AMDGPUPromoteAlloca> { + + static char ID; + Module *Mod; + const AMDGPUSubtarget &ST; + int LocalMemAvailable; + +public: + AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st), + LocalMemAvailable(0) { } + virtual bool doInitialization(Module &M); + virtual bool runOnFunction(Function &F); + virtual const char *getPassName() const { + return "AMDGPU Promote Alloca"; + } + void visitAlloca(AllocaInst &I); +}; + +} // End anonymous namespace + +char AMDGPUPromoteAlloca::ID = 0; + +bool AMDGPUPromoteAlloca::doInitialization(Module &M) { + Mod = &M; + return false; +} + +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { + + const FunctionType *FTy = F.getFunctionType(); + + LocalMemAvailable = ST.getLocalMemorySize(); + + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) { + const Type *ParamTy = FTy->getParamType(i); + if (ParamTy->isPointerTy() && + ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + LocalMemAvailable = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + break; + } + } + + if (LocalMemAvailable > 0) { + // Check how much local memory is being used by global objects + for (Module::global_iterator I = Mod->global_begin(), + E = Mod->global_end(); I != E; ++I) { + GlobalVariable *GV = I; + PointerType *GVTy = GV->getType(); + if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) + continue; + for (Value::use_iterator U = GV->use_begin(), + UE = GV->use_end(); U != UE; ++U) { + Instruction *Use = dyn_cast<Instruction>(*U); + if (!Use) + continue; + if (Use->getParent()->getParent() == &F) + LocalMemAvailable -= + Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType()); + } + } + } + + LocalMemAvailable = std::max(0, LocalMemAvailable); + DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n"); + + visit(F); + + return false; +} + +static VectorType *arrayTypeToVecType(const Type *ArrayTy) { + return VectorType::get(ArrayTy->getArrayElementType(), + ArrayTy->getArrayNumElements()); +} + +static Value* calculateVectorIndex(Value *Ptr, + std::map<GetElementPtrInst*, Value*> GEPIdx) { + if (isa<AllocaInst>(Ptr)) + return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext())); + + GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr); + + return GEPIdx[GEP]; +} + +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { + // FIXME: We only support simple cases. + if (GEP->getNumOperands() != 3) + return NULL; + + ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1)); + if (!I0 || !I0->isZero()) + return NULL; + + return GEP->getOperand(2); +} + +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + DEBUG(dbgs() << "Alloca Candidate for vectorization \n"); + + // FIXME: There is no reason why we can't support larger arrays, we + // are just being conservative for now.
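+  // Illustrative examples (not part of this patch): [4 x i32] is
+  // accepted below; [16 x i32] (too many elements) and
+  // [2 x <4 x float>] (vector elements) are rejected.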
+ if (!AllocaTy->isArrayTy() || + AllocaTy->getArrayElementType()->isVectorTy() || + AllocaTy->getArrayNumElements() > 4) { + + DEBUG(dbgs() << " Cannot convert type to vector"); + return false; + } + + std::map<GetElementPtrInst*, Value*> GEPVectorIdx; + std::vector<Value*> WorkList; + for (User *AllocaUser : Alloca->users()) { + GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser); + if (!GEP) { + WorkList.push_back(AllocaUser); + continue; + } + + Value *Index = GEPToVectorIndex(GEP); + + // If we can't compute a vector index from this GEP, then we can't + // promote this alloca to vector. + if (!Index) { + DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n"); + return false; + } + + GEPVectorIdx[GEP] = Index; + for (User *GEPUser : AllocaUser->users()) { + WorkList.push_back(GEPUser); + } + } + + VectorType *VectorTy = arrayTypeToVecType(AllocaTy); + + DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump(); + dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n"); + + for (std::vector<Value*>::iterator I = WorkList.begin(), + E = WorkList.end(); I != E; ++I) { + Instruction *Inst = cast<Instruction>(*I); + IRBuilder<> Builder(Inst); + switch (Inst->getOpcode()) { + case Instruction::Load: { + Value *Ptr = Inst->getOperand(0); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index); + Inst->replaceAllUsesWith(ExtractElement); + Inst->eraseFromParent(); + break; + } + case Instruction::Store: { + Value *Ptr = Inst->getOperand(1); + Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx); + Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0)); + Value *VecValue = Builder.CreateLoad(BitCast); + Value *NewVecValue = Builder.CreateInsertElement(VecValue, + Inst->getOperand(0), + Index); + Builder.CreateStore(NewVecValue, BitCast); + Inst->eraseFromParent(); + break; + } + case Instruction::BitCast: + break; + + default: + Inst->dump(); + llvm_unreachable("Do not know how to replace this instruction " + "with vector op"); + } + } + return true; +} + +static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) { + for (User *User : Val->users()) { + if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end()) + continue; + if (isa<CallInst>(User)) { + WorkList.push_back(User); + continue; + } + if (!User->getType()->isPointerTy()) + continue; + WorkList.push_back(User); + collectUsesWithPtrTypes(User, WorkList); + } +} + +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) { + IRBuilder<> Builder(&I); + + // First try to replace the alloca with a vector + Type *AllocaTy = I.getAllocatedType(); + + DEBUG(dbgs() << "Trying to promote " << I); + + if (tryPromoteAllocaToVector(&I)) + return; + + DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); + + // FIXME: This is the maximum work group size. We should try to get + // value from the reqd_work_group_size function attribute if it is + // available. 
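+  // For scale (illustrative arithmetic): with 256 work items, a
+  // [4 x i32] alloca needs 256 * 16 = 4096 bytes of LDS out of the
+  // 32768 bytes an Evergreen part advertises via
+  // FeatureLocalMemorySize32768.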
+ unsigned WorkGroupSize = 256; + int AllocaSize = WorkGroupSize * + Mod->getDataLayout()->getTypeAllocSize(AllocaTy); + + if (AllocaSize > LocalMemAvailable) { + DEBUG(dbgs() << " Not enough local memory to promote alloca.\n"); + return; + } + + DEBUG(dbgs() << "Promoting alloca to local memory\n"); + LocalMemAvailable -= AllocaSize; + + GlobalVariable *GV = new GlobalVariable( + *Mod, ArrayType::get(I.getAllocatedType(), 256), false, + GlobalValue::ExternalLinkage, 0, I.getName(), 0, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + FunctionType *FTy = FunctionType::get( + Type::getInt32Ty(Mod->getContext()), false); + AttributeSet AttrSet; + AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone); + + Value *ReadLocalSizeY = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.y", FTy, AttrSet); + Value *ReadLocalSizeZ = Mod->getOrInsertFunction( + "llvm.r600.read.local.size.z", FTy, AttrSet); + Value *ReadTIDIGX = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.x", FTy, AttrSet); + Value *ReadTIDIGY = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.y", FTy, AttrSet); + Value *ReadTIDIGZ = Mod->getOrInsertFunction( + "llvm.r600.read.tidig.z", FTy, AttrSet); + + + Value *TCntY = Builder.CreateCall(ReadLocalSizeY); + Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ); + Value *TIdX = Builder.CreateCall(ReadTIDIGX); + Value *TIdY = Builder.CreateCall(ReadTIDIGY); + Value *TIdZ = Builder.CreateCall(ReadTIDIGZ); + + Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ); + Tmp0 = Builder.CreateMul(Tmp0, TIdX); + Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ); + Value *TID = Builder.CreateAdd(Tmp0, Tmp1); + TID = Builder.CreateAdd(TID, TIdZ); + + std::vector<Value*> Indices; + Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext()))); + Indices.push_back(TID); + + Value *Offset = Builder.CreateGEP(GV, Indices); + I.mutateType(Offset->getType()); + I.replaceAllUsesWith(Offset); + I.eraseFromParent(); + + std::vector<Value*> WorkList; + + collectUsesWithPtrTypes(Offset, WorkList); + + for (std::vector<Value*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Value *V = *i; + CallInst *Call = dyn_cast<CallInst>(V); + if (!Call) { + Type *EltTy = V->getType()->getPointerElementType(); + PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS); + V->mutateType(NewTy); + continue; + } + + IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call); + if (!Intr) { + std::vector<Type*> ArgTypes; + for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands(); + ArgIdx != ArgEnd; ++ArgIdx) { + ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType()); + } + Function *F = Call->getCalledFunction(); + FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes, + F->isVarArg()); + Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType, + F->getAttributes()); + Function *NewF = cast<Function>(C); + Call->setCalledFunction(NewF); + continue; + } + + Builder.SetInsertPoint(Intr); + switch (Intr->getIntrinsicID()) { + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + // These intrinsics are for address space 0 only + Intr->eraseFromParent(); + continue; + case Intrinsic::memcpy: { + MemCpyInst *MemCpy = cast<MemCpyInst>(Intr); + Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(), + MemCpy->getLength(), MemCpy->getAlignment(), + MemCpy->isVolatile()); + Intr->eraseFromParent(); + continue; + } + case Intrinsic::memset: { + MemSetInst *MemSet = cast<MemSetInst>(Intr); + 
Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(), + MemSet->getLength(), MemSet->getAlignment(), + MemSet->isVolatile()); + Intr->eraseFromParent(); + continue; + } + default: + Intr->dump(); + llvm_unreachable("Don't know how to promote alloca intrinsic use."); + } + } +} + +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) { + return new AMDGPUPromoteAlloca(ST); +} diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index 348d50f93f..4fd43905b2 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -41,6 +41,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : EnableIfCvt = true; WavefrontSize = 0; CFALUBug = false; + LocalMemorySize = 0; ParseSubtargetFeatures(GPU, FS); DevName = GPU; @@ -109,6 +110,10 @@ AMDGPUSubtarget::hasCFAluBug() const { assert(getGeneration() <= NORTHERN_ISLANDS); return CFALUBug; } +int +AMDGPUSubtarget::getLocalMemorySize() const { + return LocalMemorySize; +} bool AMDGPUSubtarget::isTargetELF() const { return false; diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 411aaf9711..9c78f35df3 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -56,6 +56,7 @@ private: bool EnableIfCvt; unsigned WavefrontSize; bool CFALUBug; + int LocalMemorySize; InstrItineraryData InstrItins; @@ -109,6 +110,7 @@ public: unsigned getWavefrontSize() const; unsigned getStackEntrySize() const; bool hasCFAluBug() const; + int getLocalMemorySize() const; bool enableMachineScheduler() const override { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 3c896af46a..be1eceaaa0 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -109,6 +109,7 @@ public: return nullptr; } + virtual void addCodeGenPrepare(); bool addPreISel() override; bool addInstSelector() override; bool addPreRegAlloc() override; @@ -134,6 +135,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { PM.add(createAMDGPUTargetTransformInfoPass(this)); } +void AMDGPUPassConfig::addCodeGenPrepare() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + addPass(createAMDGPUPromoteAlloca(ST)); + addPass(createSROAPass()); + TargetPassConfig::addCodeGenPrepare(); +} + bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 128dd683fb..be2ca06ef3 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_target(R600CodeGen AMDGPUTargetTransformInfo.cpp AMDGPUISelLowering.cpp AMDGPUInstrInfo.cpp + AMDGPUPromoteAlloca.cpp AMDGPURegisterInfo.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 87238d6156..beea54e14e 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -136,6 +136,16 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::FrameIndex, MVT::i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, 
MVT::v4i32, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); + + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); + setTargetDAGCombine(ISD::FP_ROUND); setTargetDAGCombine(ISD::FP_TO_SINT); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); @@ -540,6 +550,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); + case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); case ISD::FCOS: case ISD::FSIN: return LowerTrig(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); @@ -812,6 +824,56 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N, } } +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG, + SDValue Vector) const { + + SDLoc DL(Vector); + EVT VecVT = Vector.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + SmallVector<SDValue, 8> Args; + + for (unsigned i = 0, e = VecVT.getVectorNumElements(); + i != e; ++i) { + Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, + Vector, DAG.getConstant(i, getVectorIdxTy()))); + } + + return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args); +} + +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Index = Op.getOperand(1); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Index); +} + +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + SDValue Vector = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Index = Op.getOperand(2); + + if (isa<ConstantSDNode>(Index) || + Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR) + return Op; + + Vector = vectorToVerticalVector(DAG, Vector); + SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(), + Vector, Value, Index); + return vectorToVerticalVector(DAG, Insert); +} + SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { // On hw >= R700, COS/SIN input must be between -1. and 1. 
// Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5) diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h index 301509fc8c..ed12dab48b 100644 --- a/lib/Target/R600/R600ISelLowering.h +++ b/lib/Target/R600/R600ISelLowering.h @@ -51,7 +51,10 @@ private: void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, MachineRegisterInfo & MRI, unsigned dword_offset) const; SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const; + SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const; + SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 080ff6eea7..3972e2f037 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -51,11 +51,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, unsigned DestReg, unsigned SrcReg, bool KillSrc) const { unsigned VectorComponents = 0; - if (AMDGPU::R600_Reg128RegClass.contains(DestReg) && - AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg128RegClass.contains(SrcReg) || + AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) { VectorComponents = 4; - } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) && - AMDGPU::R600_Reg64RegClass.contains(SrcReg)) { + } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) && + (AMDGPU::R600_Reg64RegClass.contains(SrcReg) || + AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) { VectorComponents = 2; } @@ -1053,6 +1057,29 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, return 2; } +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const { + + switch(MI->getOpcode()) { + default: return AMDGPUInstrInfo::expandPostRAPseudo(MI); + case AMDGPU::R600_EXTRACT_ELT_V2: + case AMDGPU::R600_EXTRACT_ELT_V4: + buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(), + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(2).getReg(), + RI.getHWRegChan(MI->getOperand(1).getReg())); + break; + case AMDGPU::R600_INSERT_ELT_V2: + case AMDGPU::R600_INSERT_ELT_V4: + buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value + RI.getHWRegIndex(MI->getOperand(1).getReg()), // Address + MI->getOperand(3).getReg(), // Offset + RI.getHWRegChan(MI->getOperand(1).getReg())); // Channel + break; + } + MI->eraseFromParent(); + return true; +} + void R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const { const AMDGPUFrameLowering *TFL = @@ -1090,7 +1117,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned 
OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); setImmOperand(MOVA, AMDGPU::OpName::write, 0); @@ -1107,7 +1149,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, unsigned ValueReg, unsigned Address, unsigned OffsetReg) const { - unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); + return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0); +} + +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const { + unsigned AddrReg; + switch (AddrChan) { + default: llvm_unreachable("Invalid Channel"); + case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break; + case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break; + case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break; + case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break; + } MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg, AMDGPU::AR_X, OffsetReg); diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 60df21571b..45a57d367b 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -36,6 +36,18 @@ namespace llvm { std::vector<std::pair<int, unsigned> > ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const; + + MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; + + MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + unsigned ValueReg, unsigned Address, + unsigned OffsetReg, + unsigned AddrChan) const; public: enum BankSwizzle { ALU_VEC_012_SCL_210 = 0, @@ -195,6 +207,8 @@ namespace llvm { int getInstrLatency(const InstrItineraryData *ItinData, SDNode *Node) const override { return 1;} + virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const; + /// \brief Reserve the registers that may be accessed using indirect addressing.
void reserveIndirectRegisters(BitVector &Reserved, const MachineFunction &MF) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index ec039c50da..58c704d8ec 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1582,6 +1582,60 @@ let isTerminator=1 in { } //===----------------------------------------------------------------------===// +// Indirect addressing pseudo instructions +//===----------------------------------------------------------------------===// + +let isPseudo = 1 in { + +class ExtractVertical <RegisterClass vec_rc> : InstR600 < + (outs R600_Reg32:$dst), + (ins vec_rc:$vec, R600_Reg32:$index), "", + [], + AnyALU +>; + +let Constraints = "$dst = $vec" in { + +class InsertVertical <RegisterClass vec_rc> : InstR600 < + (outs vec_rc:$dst), + (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "", + [], + AnyALU +>; + +} // End Constraints = "$dst = $vec" + +} // End isPseudo = 1 + +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>; +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>; + +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>; +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>; + +class ExtractVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (scalar_ty (extractelt vec_ty:$vec, i32:$index)), + (inst $vec, $index) +>; + +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>; +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>; + +class InsertVerticalPat <Instruction inst, ValueType vec_ty, + ValueType scalar_ty> : Pat < + (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)), + (inst $vec, $value, $index) +>; + +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>; +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>; + +//===----------------------------------------------------------------------===// // ISel Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td index 68bcd207b4..cc667d985a 100644 --- a/lib/Target/R600/R600RegisterInfo.td +++ b/lib/Target/R600/R600RegisterInfo.td @@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> : class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1, sub2, sub3]; - let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> { + field bits<2> chan_encoding = 0; let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = encoding; + let HWEncoding{8-0} = encoding{8-0}; + let HWEncoding{10-9} = chan_encoding; } +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 < + "V"#lo#hi#"_"#chan, + [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)], + lo +>; foreach Index = 0-127 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -54,6 +64,24 @@ foreach Index = 0-127 in { Index>; } +foreach Chan = [ "X", "Y", "Z", "W"] in { 
+ + let chan_encoding = !if(!eq(Chan, "X"), 0, + !if(!eq(Chan, "Y"), 1, + !if(!eq(Chan, "Z"), 2, + !if(!eq(Chan, "W"), 3, 0)))) in { + def V0123_#Chan : R600Reg_128 <"V0123_"#Chan, + [!cast<Register>("T0_"#Chan), + !cast<Register>("T1_"#Chan), + !cast<Register>("T2_"#Chan), + !cast<Register>("T3_"#Chan)], + 0>; + def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>; + def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>; + } +} + + // KCACHE_BANK0 foreach Index = 159-128 in { foreach Chan = [ "X", "Y", "Z", "W" ] in { @@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>; let isAllocatable = 0 in { -// XXX: Only use the X channel, until we support wider stack widths -def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>; +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>; + +// We only use Addr_[YZW] for vertical vectors. +// FIXME: If we add more vertical vector registers we will need to add more +// registers to these classes. +def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>; +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>; +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>; def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32, (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>; @@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, let CopyCost = -1; } +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add V0123_W, V0123_Z, V0123_Y, V0123_X) +>; + def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, (add (sequence "T%u_XY", 0, 63))>; + +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64, + (add V01_X, V01_Y, V01_Z, V01_W, + V23_X, V23_Y, V23_Z, V23_W)>; diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 2f04bea269..48b6b7d730 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -2560,13 +2560,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I // 1. Extract with offset def : Pat< (vector_extract vt:$vec, (add i32:$idx, imm:$off)), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) + (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off)) >; // 2. Extract without offset def : Pat< (vector_extract vt:$vec, i32:$idx), - (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) + (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0)) >; // 3. Insert with offset diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll index c2362da15c..3230353c36 100644 --- a/test/CodeGen/R600/array-ptr-calc-i32.ll +++ b/test/CodeGen/R600/array-ptr-calc-i32.ll @@ -10,7 +10,12 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate ; SI-LABEL: @test_private_array_ptr_calc: ; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]] -; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]] +; +; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this +; alloca to a vector.
It currently fails because it does not know how +; to interpret: +; getelementptr [4 x i32]* %alloca, i32 1, i32 %b +; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]] define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %alloca = alloca [4 x i32], i32 4, align 16 %tid = call i32 @llvm.SI.tid() readnone diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll index 4d1f7347ec..b127b7ede2 100644 --- a/test/CodeGen/R600/indirect-private-64.ll +++ b/test/CodeGen/R600/indirect-private-64.ll @@ -3,10 +3,8 @@ declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind ; SI-LABEL: @private_access_f64_alloca: -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 +; SI: DS_WRITE_B64 +; SI: DS_READ_B64 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind { %val = load double addrspace(1)* %in, align 8 %array = alloca double, i32 16, align 8 @@ -19,14 +17,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double } ; SI-LABEL: @private_access_v2f64_alloca: -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 +; SI: DS_WRITE_B64 +; SI: DS_WRITE_B64 +; SI: DS_READ_B64 +; SI: DS_READ_B64 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x double> addrspace(1)* %in, align 16 %array = alloca <2 x double>, i32 16, align 16 @@ -39,10 +33,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out } ; SI-LABEL: @private_access_i64_alloca: -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 +; SI: DS_WRITE_B64 +; SI: DS_READ_B64 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind { %val = load i64 addrspace(1)* %in, align 8 %array = alloca i64, i32 16, align 8 @@ -55,14 +47,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs } ; SI-LABEL: @private_access_v2i64_alloca: -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELD_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 -; SI: V_MOVRELS_B32_e32 +; SI: DS_WRITE_B64 +; SI: DS_WRITE_B64 +; SI: DS_READ_B64 +; SI: DS_READ_B64 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind { %val = load <2 x i64> addrspace(1)* %in, align 16 %array = alloca <2 x i64>, i32 16, align 16 diff --git a/test/CodeGen/R600/large-alloca.ll b/test/CodeGen/R600/large-alloca.ll index dd9b6775c0..d8be6d40f3 100644 --- a/test/CodeGen/R600/large-alloca.ll +++ b/test/CodeGen/R600/large-alloca.ll @@ -2,10 +2,13 @@ ; REQUIRES: asserts ; RUN: llc -march=r600 -mcpu=SI < %s -define void @large_alloca(i32 addrspace(1)* %out, i32 %x) nounwind { - %large = alloca [256 x i32], align 4 - %gep = getelementptr [256 x i32]* %large, i32 0, i32 255 +define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) nounwind { + %large = alloca [8192 x i32], align 4 + %gep = getelementptr [8192 x i32]* %large, i32 0, i32 8191 store i32 %x, i32* %gep + %gep1 = getelementptr [8192 
x i32]* %large, i32 0, i32 %y + %0 = load i32* %gep1 + store i32 %0, i32 addrspace(1)* %out ret void } diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll index 4afaf684bf..8a269e0cb4 100644 --- a/test/CodeGen/R600/parallelandifcollapse.ll +++ b/test/CodeGen/R600/parallelandifcollapse.ll @@ -7,6 +7,12 @@ ; CHECK: AND_INT ; CHECK-NEXT: AND_INT ; CHECK-NEXT: OR_INT + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation, however now that we are using local memory for +; allocas, the transformation isn't happening. +; XFAIL: * + define void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll index b0db7cdd06..feca688c30 100644 --- a/test/CodeGen/R600/parallelorifcollapse.ll +++ b/test/CodeGen/R600/parallelorifcollapse.ll @@ -3,6 +3,11 @@ ; ; CFG flattening should use parallel-or to generate branch conditions and ; then merge if-regions with the same bodies. + +; FIXME: For some reason having the allocas here allowed the flatten cfg pass +; to do its transformation, however now that we are using local memory for +; allocas, the transformation isn't happening. +; XFAIL: * ; ; CHECK: OR_INT ; CHECK-NEXT: OR_INT diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll index d3453f26ae..c60c059756 100644 --- a/test/CodeGen/R600/private-memory.ll +++ b/test/CodeGen/R600/private-memory.ll @@ -1,24 +1,17 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC -; This test checks that uses and defs of the AR register happen in the same -; instruction clause.
- ; FUNC-LABEL: @mova_same_clause -; R600-CHECK: MOVA_INT -; R600-CHECK-NOT: ALU clause -; R600-CHECK: 0 + AR.x -; R600-CHECK: MOVA_INT -; R600-CHECK-NOT: ALU clause -; R600-CHECK: 0 + AR.x - -; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo -; SI-CHECK: V_MOVRELD -; SI-CHECK: S_CBRANCH -; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo -; SI-CHECK: V_MOVRELD -; SI-CHECK: S_CBRANCH +; R600-CHECK: LDS_WRITE +; R600-CHECK: LDS_WRITE +; R600-CHECK: LDS_READ +; R600-CHECK: LDS_READ + +; SI-CHECK: DS_WRITE_B32 +; SI-CHECK: DS_WRITE_B32 +; SI-CHECK: DS_READ_B32 +; SI-CHECK: DS_READ_B32 define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 @@ -114,12 +107,8 @@ for.end: ; FUNC-LABEL: @short_array -; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal -; R600-CHECK: 65536 -; R600-CHECK: * ; R600-CHECK: MOVA_INT -; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000 ; SI-CHECK: V_MOVRELS_B32_e32 define void @short_array(i32 addrspace(1)* %out, i32 %index) { entry: @@ -137,10 +126,7 @@ entry: ; FUNC-LABEL: @char_array -; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal -; R600-CHECK: 256 -; R600-CHECK: * -; R600-CHECK-NEXT: MOVA_INT +; R600-CHECK: MOVA_INT ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100 ; SI-CHECK: V_MOVRELS_B32_e32 @@ -185,7 +171,9 @@ entry: ; Test that two stack objects are not stored in the same register ; The second stack object should be in T3.X ; FUNC-LABEL: @no_overlap -; R600-CHECK: MOV {{\** *}}T3.X +; R600_CHECK: MOV +; R600_CHECK: [[CHAN:[XYZW]]]+ +; R600-CHECK-NOT: [[CHAN]]+ ; SI-CHECK: V_MOV_B32_e32 v3 define void @no_overlap(i32 addrspace(1)* %out, i32 %in) { entry: diff --git a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll index d9f60ea1a4..dee432664e 100644 --- a/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll +++ b/test/CodeGen/R600/simplify-demanded-bits-build-pair.ll @@ -1,5 +1,7 @@ ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI %s +; XFAIL: * + ; 64-bit select was originally lowered with a build_pair, and this ; could be simplified to 1 cndmask instead of 2, but that broken when ; it started being implemented with a v2i32 build_vector and @@ -12,9 +14,10 @@ define void @trunc_select_i64(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) { ret void } +; FIXME: Fix truncating store for local memory ; SI-LABEL: @trunc_load_alloca_i64: -; SI: V_MOVRELS_B32 -; SI-NOT: V_MOVRELS_B32 +; SI: DS_READ_B32 +; SI-NOT: DS_READ_B64 ; SI: S_ENDPGM define void @trunc_load_alloca_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) { %idx = add i32 %a, %b diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll new file mode 100644 index 0000000000..6543f6d059 --- /dev/null +++ b/test/CodeGen/R600/vector-alloca.ll @@ -0,0 +1,74 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s +; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s + +; FUNC-LABEL: @vector_read +; EG: MOV +; EG: MOV +; EG: MOV +; EG: MOV +; EG: MOVA_INT +define void @vector_read(i32 addrspace(1)* %out, i32 %index) { +entry: + %0 = alloca [4 x i32] + %x = getelementptr [4 x i32]* %0, i32 0, i32 0 + %y = getelementptr [4 x i32]* %0, i32 0, i32 1 + %z = getelementptr [4 x i32]* %0, i32 0, i32 2 + %w = getelementptr [4 x i32]* %0, i32 0, i32 3 + store i32 0, i32* %x + store i32 1, i32* %y + store i32 
2, i32* %z + store i32 3, i32* %w + %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index + %2 = load i32* %1 + store i32 %2, i32 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: @vector_write +; EG: MOV +; EG: MOV +; EG: MOV +; EG: MOV +; EG: MOVA_INT +; EG: MOVA_INT +define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %0 = alloca [4 x i32] + %x = getelementptr [4 x i32]* %0, i32 0, i32 0 + %y = getelementptr [4 x i32]* %0, i32 0, i32 1 + %z = getelementptr [4 x i32]* %0, i32 0, i32 2 + %w = getelementptr [4 x i32]* %0, i32 0, i32 3 + store i32 0, i32* %x + store i32 0, i32* %y + store i32 0, i32* %z + store i32 0, i32* %w + %1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index + store i32 1, i32* %1 + %2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index + %3 = load i32* %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +; This test should be optimized to: +; store i32 0, i32 addrspace(1)* %out +; FUNC-LABEL: @bitcast_gep +; CHECK: STORE_RAW +define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +entry: + %0 = alloca [4 x i32] + %x = getelementptr [4 x i32]* %0, i32 0, i32 0 + %y = getelementptr [4 x i32]* %0, i32 0, i32 1 + %z = getelementptr [4 x i32]* %0, i32 0, i32 2 + %w = getelementptr [4 x i32]* %0, i32 0, i32 3 + store i32 0, i32* %x + store i32 0, i32* %y + store i32 0, i32* %z + store i32 0, i32* %w + %1 = getelementptr [4 x i32]* %0, i32 0, i32 1 + %2 = bitcast i32* %1 to [4 x i32]* + %3 = getelementptr [4 x i32]* %2, i32 0, i32 0 + %4 = load i32* %3 + store i32 %4, i32 addrspace(1)* %out + ret void +}
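The split between the two strategies is visible in the tests above: in vector-alloca.ll, an alloca rewritten to a vector is dynamically indexed with MOVA_INT on Evergreen, while in private-memory.ll, allocas moved to LDS show up as LDS_WRITE/LDS_READ (DS_WRITE_B32/DS_READ_B32 on SI). The RUN lines from those tests reproduce this, e.g.:

    llc -march=r600 -mcpu=redwood < test/CodeGen/R600/vector-alloca.ll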