path: root/lib/Target/R600/AMDGPUPromoteAlloca.cpp
author    Tom Stellard <thomas.stellard@amd.com>    2014-06-17 16:53:14 +0000
committer Tom Stellard <thomas.stellard@amd.com>    2014-06-17 16:53:14 +0000
commit    f56e7678d1ced97d5513e0a75658dc48396e4a58 (patch)
tree      72898cf24c9d9becee91dec7b74d37c2bbfc9242 /lib/Target/R600/AMDGPUPromoteAlloca.cpp
parent    ff8dc48da387719d4b4c4712715be0e2d2672d87 (diff)
R600: Use LDS and vectors for private memory
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211110 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/R600/AMDGPUPromoteAlloca.cpp')
-rw-r--r--  lib/Target/R600/AMDGPUPromoteAlloca.cpp  365
1 file changed, 365 insertions(+), 0 deletions(-)
diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
new file mode 100644
index 0000000000..2d3a7fdd4f
--- /dev/null
+++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
@@ -0,0 +1,365 @@
+//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates allocas by either converting them into vectors or
+// by migrating them to local address space.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "amdgpu-promote-alloca"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUPromoteAlloca : public FunctionPass,
+                            public InstVisitor<AMDGPUPromoteAlloca> {
+
+  static char ID;
+  Module *Mod;
+  const AMDGPUSubtarget &ST;
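+  // Estimate of the LDS bytes still available for promotion in the current
+  // function; decremented as allocas are moved into local memory.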
+  int LocalMemAvailable;
+
+public:
+  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
+                                                   LocalMemAvailable(0) { }
+  virtual bool doInitialization(Module &M);
+  virtual bool runOnFunction(Function &F);
+  virtual const char *getPassName() const {
+    return "AMDGPU Promote Alloca";
+  }
+  void visitAlloca(AllocaInst &I);
+};
+
+} // End anonymous namespace
+
+char AMDGPUPromoteAlloca::ID = 0;
+
+bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
+  Mod = &M;
+  return false;
+}
+
+bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
+
+  const FunctionType *FTy = F.getFunctionType();
+
+  LocalMemAvailable = ST.getLocalMemorySize();
+
+  // If the function has any arguments in the local address space, then it's
+  // possible these arguments require the entire local memory space, so
+  // we cannot use local memory in the pass.
+  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
+    const Type *ParamTy = FTy->getParamType(i);
+    if (ParamTy->isPointerTy() &&
+        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+      LocalMemAvailable = 0;
+      DEBUG(dbgs() << "Function has local memory argument. Promotion to "
+                      "local memory disabled.\n");
+      break;
+    }
+  }
+
+  if (LocalMemAvailable > 0) {
+    // Check how much local memory is being used by global objects
+    for (Module::global_iterator I = Mod->global_begin(),
+                                 E = Mod->global_end(); I != E; ++I) {
+      GlobalVariable *GV = I;
+      PointerType *GVTy = GV->getType();
+      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+        continue;
+      for (Value::use_iterator U = GV->use_begin(),
+                               UE = GV->use_end(); U != UE; ++U) {
+        Instruction *Use = dyn_cast<Instruction>(*U);
+        if (!Use)
+          continue;
+        // Charge each global's size at most once per function, even if it
+        // has several uses in F.
+        if (Use->getParent()->getParent() == &F) {
+          LocalMemAvailable -=
+              Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
+          break;
+        }
+      }
+    }
+  }
+
+  LocalMemAvailable = std::max(0, LocalMemAvailable);
+  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
+
+  visit(F);
+
+  return false;
+}
+
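+// Map an array type [N x T] to the vector type <N x T> with the same element
+// type and element count.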
+static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
+  return VectorType::get(ArrayTy->getArrayElementType(),
+                         ArrayTy->getArrayNumElements());
+}
+
+static Value* calculateVectorIndex(Value *Ptr,
+                                   const std::map<GetElementPtrInst*, Value*> &GEPIdx) {
+  if (isa<AllocaInst>(Ptr))
+    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));
+
+  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+
+  std::map<GetElementPtrInst*, Value*>::const_iterator I = GEPIdx.find(GEP);
+  return I != GEPIdx.end() ? I->second : NULL;
+}
+
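+// Return the vector lane for the only GEP shape handled here:
+//   getelementptr [N x T]* %alloca, i32 0, i32 %idx
+// i.e. a zero first index and a single element index, which becomes the lane.
+// Any other shape returns NULL.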
+static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
+  // FIXME: We only support simple cases.
+  if (GEP->getNumOperands() != 3)
+    return NULL;
+
+  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
+  if (!I0 || !I0->isZero())
+    return NULL;
+
+  return GEP->getOperand(2);
+}
+
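+// Rewrite all loads/stores through a small private array as lane accesses
+// into a single vector. A sketch of the rewrite on 2014-era typed-pointer IR
+// (value names are illustrative only):
+//
+//   %alloca = alloca [4 x i32]
+//   %gep    = getelementptr [4 x i32]* %alloca, i32 0, i32 %idx
+//   %val    = load i32* %gep
+//
+// becomes, conceptually:
+//
+//   %bc  = bitcast [4 x i32]* %alloca to <4 x i32>*
+//   %vec = load <4 x i32>* %bc
+//   %val = extractelement <4 x i32> %vec, i32 %idx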
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+  Type *AllocaTy = Alloca->getAllocatedType();
+
+  DEBUG(dbgs() << "Alloca candidate for vectorization\n");
+
+  // FIXME: There is no reason why we can't support larger arrays, we
+  // are just being conservative for now.
+  if (!AllocaTy->isArrayTy() ||
+      AllocaTy->getArrayElementType()->isVectorTy() ||
+      AllocaTy->getArrayNumElements() > 4) {
+
+    DEBUG(dbgs() << " Cannot convert type to vector\n");
+    return false;
+  }
+
+  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+  std::vector<Value*> WorkList;
+  for (User *AllocaUser : Alloca->users()) {
+    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
+    if (!GEP) {
+      WorkList.push_back(AllocaUser);
+      continue;
+    }
+
+    Value *Index = GEPToVectorIndex(GEP);
+
+    // If we can't compute a vector index from this GEP, then we can't
+    // promote this alloca to vector.
+    if (!Index) {
+      DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << "\n");
+      return false;
+    }
+
+    GEPVectorIdx[GEP] = Index;
+    for (User *GEPUser : AllocaUser->users()) {
+      WorkList.push_back(GEPUser);
+    }
+  }
+
+  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
+
+  DEBUG(dbgs() << " Converting alloca to vector "; AllocaTy->dump();
+        dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n");
+
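+  // Rewrite each queued access: loads become a whole-vector load plus
+  // extractelement; stores become a load/insertelement/store
+  // read-modify-write of the full vector.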
+  for (std::vector<Value*>::iterator I = WorkList.begin(),
+                                     E = WorkList.end(); I != E; ++I) {
+    Instruction *Inst = cast<Instruction>(*I);
+    IRBuilder<> Builder(Inst);
+    switch (Inst->getOpcode()) {
+    case Instruction::Load: {
+      Value *Ptr = Inst->getOperand(0);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+      Inst->replaceAllUsesWith(ExtractElement);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::Store: {
+      Value *Ptr = Inst->getOperand(1);
+      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
+      Value *VecValue = Builder.CreateLoad(BitCast);
+      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
+                                                       Inst->getOperand(0),
+                                                       Index);
+      Builder.CreateStore(NewVecValue, BitCast);
+      Inst->eraseFromParent();
+      break;
+    }
+    case Instruction::BitCast:
+      break;
+
+    default:
+      Inst->dump();
+      llvm_unreachable("Do not know how to replace this instruction "
+                       "with vector op");
+    }
+  }
+  return true;
+}
+
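+// Recursively collect Val's transitive users that themselves produce pointers
+// (GEPs, bitcasts, ...), plus any calls that use Val, so the caller can
+// rewrite their types and callees for the new address space.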
+static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
+  for (User *U : Val->users()) {
+    if (std::find(WorkList.begin(), WorkList.end(), U) != WorkList.end())
+      continue;
+    if (isa<CallInst>(U)) {
+      WorkList.push_back(U);
+      continue;
+    }
+    if (!U->getType()->isPointerTy())
+      continue;
+    WorkList.push_back(U);
+    collectUsesWithPtrTypes(U, WorkList);
+  }
+}
+
+void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
+  IRBuilder<> Builder(&I);
+
+  // First try to replace the alloca with a vector
+  Type *AllocaTy = I.getAllocatedType();
+
+  DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+  if (tryPromoteAllocaToVector(&I))
+    return;
+
+  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
+
+  // FIXME: This is the maximum work group size. We should try to get the
+  // value from the reqd_work_group_size function attribute if it is
+  // available.
+  unsigned WorkGroupSize = 256;
+  int AllocaSize = WorkGroupSize *
+                   Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
+
+  if (AllocaSize > LocalMemAvailable) {
+    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
+    return;
+  }
+
+  DEBUG(dbgs() << "Promoting alloca to local memory\n");
+  LocalMemAvailable -= AllocaSize;
+
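+  // Back the alloca with one LDS array per work group, sized with one element
+  // per work item; each work item addresses its own element through the flat
+  // thread id computed below.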
+  GlobalVariable *GV = new GlobalVariable(
+      *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
+      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
+      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+
+  FunctionType *FTy = FunctionType::get(
+      Type::getInt32Ty(Mod->getContext()), false);
+  AttributeSet AttrSet;
+  // AttributeSet is immutable: addAttribute returns the updated set rather
+  // than modifying AttrSet in place, so the result must be assigned back.
+  AttrSet = AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
+
+  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.y", FTy, AttrSet);
+  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.local.size.z", FTy, AttrSet);
+  Value *ReadTIDIGX = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.x", FTy, AttrSet);
+  Value *ReadTIDIGY = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.y", FTy, AttrSet);
+  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
+      "llvm.r600.read.tidig.z", FTy, AttrSet);
+
+  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
+  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
+  Value *TIdX = Builder.CreateCall(ReadTIDIGX);
+  Value *TIdY = Builder.CreateCall(ReadTIDIGY);
+  Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);
+
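+  // Flatten the 3-D work item id into a linear index:
+  //   TID = tidig.x * (local_size.y * local_size.z)
+  //       + tidig.y * local_size.z
+  //       + tidig.z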
+  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
+  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
+  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
+  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
+  TID = Builder.CreateAdd(TID, TIdZ);
+
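+  // Address this work item's element of the LDS array: &GV[0][TID].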
+  std::vector<Value*> Indices;
+  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
+  Indices.push_back(TID);
+
+  Value *Offset = Builder.CreateGEP(GV, Indices);
+  I.mutateType(Offset->getType());
+  I.replaceAllUsesWith(Offset);
+  I.eraseFromParent();
+
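+  // The replacement pointer lives in the local address space, so the type
+  // change has to be propagated through every transitive pointer user, and
+  // calls and intrinsics that touch the pointer must be patched up.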
+  std::vector<Value*> WorkList;
+
+  collectUsesWithPtrTypes(Offset, WorkList);
+
+  for (std::vector<Value*>::iterator i = WorkList.begin(),
+                                     e = WorkList.end(); i != e; ++i) {
+    Value *V = *i;
+    CallInst *Call = dyn_cast<CallInst>(V);
+    if (!Call) {
+      Type *EltTy = V->getType()->getPointerElementType();
+      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+      V->mutateType(NewTy);
+      continue;
+    }
+
+    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
+    if (!Intr) {
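+      // A call to an ordinary (non-intrinsic) function: redirect it to a
+      // ".local" variant of the callee whose signature matches the rewritten
+      // argument types. (The ".local" suffix is a convention of this pass.)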
+      std::vector<Type*> ArgTypes;
+      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
+           ArgIdx != ArgEnd; ++ArgIdx) {
+        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
+      }
+      Function *F = Call->getCalledFunction();
+      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
+                                                F->isVarArg());
+      Constant *C = Mod->getOrInsertFunction(F->getName().str() + ".local",
+                                             NewType, F->getAttributes());
+      Function *NewF = cast<Function>(C);
+      Call->setCalledFunction(NewF);
+      continue;
+    }
+
+    Builder.SetInsertPoint(Intr);
+    switch (Intr->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+      // These intrinsics are for address space 0 only
+      Intr->eraseFromParent();
+      continue;
+    case Intrinsic::memcpy: {
+      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
+      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
+                           MemCpy->getLength(), MemCpy->getAlignment(),
+                           MemCpy->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    case Intrinsic::memset: {
+      MemSetInst *MemSet = cast<MemSetInst>(Intr);
+      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
+                           MemSet->getLength(), MemSet->getAlignment(),
+                           MemSet->isVolatile());
+      Intr->eraseFromParent();
+      continue;
+    }
+    default:
+      Intr->dump();
+      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
+    }
+  }
+}
+
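+// Factory for the target to create this pass; the subtarget is passed in so
+// runOnFunction can query how much local memory (LDS) the target provides.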
+FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
+  return new AMDGPUPromoteAlloca(ST);
+}