//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"

using namespace llvm;

namespace {

class PreloadKernelArgInfo {
private:
  Function &F;
  const GCNSubtarget &ST;
  unsigned NumFreeUserSGPRs;

  enum HiddenArg : unsigned {
    HIDDEN_BLOCK_COUNT_X,
    HIDDEN_BLOCK_COUNT_Y,
    HIDDEN_BLOCK_COUNT_Z,
    HIDDEN_GROUP_SIZE_X,
    HIDDEN_GROUP_SIZE_Y,
    HIDDEN_GROUP_SIZE_Z,
    HIDDEN_REMAINDER_X,
    HIDDEN_REMAINDER_Y,
    HIDDEN_REMAINDER_Z,
    END_HIDDEN_ARGS
  };
  // Stores information about a specific hidden argument.
  struct HiddenArgInfo {
    // Offset in bytes from the location in the kernarg segment pointed to by
    // the implicitarg pointer.
    uint8_t Offset;
    // The size of the hidden argument in bytes.
    uint8_t Size;
    // The name of the hidden argument in the kernel signature.
    const char *Name;
  };

  static constexpr HiddenArgInfo HiddenArgs[END_HIDDEN_ARGS] = {
      {0, 4, "_hidden_block_count_x"}, {4, 4, "_hidden_block_count_y"},
      {8, 4, "_hidden_block_count_z"}, {12, 2, "_hidden_group_size_x"},
      {14, 2, "_hidden_group_size_y"}, {16, 2, "_hidden_group_size_z"},
      {18, 2, "_hidden_remainder_x"},  {20, 2, "_hidden_remainder_y"},
      {22, 2, "_hidden_remainder_z"}};
  static HiddenArg getHiddenArgFromOffset(unsigned Offset) {
    for (unsigned I = 0; I < END_HIDDEN_ARGS; ++I)
      if (HiddenArgs[I].Offset == Offset)
        return static_cast<HiddenArg>(I);

    return END_HIDDEN_ARGS;
  }
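  // e.g., getHiddenArgFromOffset(8) yields HIDDEN_BLOCK_COUNT_Z, while an
  // offset that starts no hidden argument (e.g., 13) yields END_HIDDEN_ARGS.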
  static Type *getHiddenArgType(LLVMContext &Ctx, HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS)
      return Type::getIntNTy(Ctx, HiddenArgs[HA].Size * 8);

    llvm_unreachable("Unexpected hidden argument.");
  }
  static const char *getHiddenArgName(HiddenArg HA) {
    if (HA < END_HIDDEN_ARGS) {
      return HiddenArgs[HA].Name;
    }
    llvm_unreachable("Unexpected hidden argument.");
  }
  // Clones the function after adding implicit arguments to the argument list
  // and returns the new updated function. Preloaded implicit arguments are
  // added up to and including the last one that will be preloaded, indicated
  // by LastPreloadIndex. Currently preloading is only performed on the
  // totality of sequential data from the kernarg segment including implicit
  // (hidden) arguments. This means that all arguments up to the last preloaded
  // argument will also be preloaded even if that data is unused.
  Function *cloneFunctionWithPreloadImplicitArgs(unsigned LastPreloadIndex) {
    FunctionType *FT = F.getFunctionType();
    LLVMContext &Ctx = F.getParent()->getContext();
    SmallVector<Type *, 16> FTypes(FT->param_begin(), FT->param_end());
    for (unsigned I = 0; I <= LastPreloadIndex; ++I)
      FTypes.push_back(getHiddenArgType(Ctx, HiddenArg(I)));

    FunctionType *NFT =
        FunctionType::get(FT->getReturnType(), FTypes, FT->isVarArg());
    Function *NF =
        Function::Create(NFT, F.getLinkage(), F.getAddressSpace(), F.getName());

    NF->copyAttributesFrom(&F);
    NF->copyMetadata(&F, 0);
    NF->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);

    F.getParent()->getFunctionList().insert(F.getIterator(), NF);
    NF->takeName(&F);
    NF->splice(NF->begin(), &F);

    Function::arg_iterator NFArg = NF->arg_begin();
    for (Argument &Arg : F.args()) {
      Arg.replaceAllUsesWith(&*NFArg);
      NFArg->takeName(&Arg);
      ++NFArg;
    }

    AttrBuilder AB(Ctx);
    AB.addAttribute(Attribute::InReg);
    AB.addAttribute("amdgpu-hidden-argument");
    AttributeList AL = NF->getAttributes();
    for (unsigned I = 0; I <= LastPreloadIndex; ++I) {
      AL = AL.addParamAttributes(Ctx, NFArg->getArgNo(), AB);
      NFArg++->setName(getHiddenArgName(HiddenArg(I)));
    }

    NF->setAttributes(AL);
    F.replaceAllUsesWith(NF);
    F.setCallingConv(CallingConv::C);

    return NF;
  }
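  // Illustrative sketch: cloning `amdgpu_kernel void @k(i32 %n)` with
  // LastPreloadIndex = HIDDEN_BLOCK_COUNT_Y would produce a clone of the form
  //   void @k(i32 %n, i32 inreg %_hidden_block_count_x,
  //           i32 inreg %_hidden_block_count_y)
  // with all uses of the original function rewired to the clone.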
public:
  PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
    setInitialFreeUserSGPRsCount();
  }

  // Sets the maximum number of user SGPRs that we have available to preload
  // arguments.
  void setInitialFreeUserSGPRsCount() {
    GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
    NumFreeUserSGPRs = UserSGPRInfo.getNumFreeUserSGPRs();
  }
  bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
                            uint64_t LastExplicitArgOffset) {
    // Check if this argument may be loaded into the same register as the
    // previous argument.
    if (ArgOffset - LastExplicitArgOffset < 4 &&
        !isAligned(Align(4), ArgOffset))
      return true;

    // Pad SGPRs for kernarg alignment.
    unsigned Padding = ArgOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
    unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
    if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
      return false;

    NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
    return true;
  }
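  // Worked example: if the previous argument ended at byte 42 and an 8-byte
  // argument is placed at its aligned offset 48, Padding = 6, so
  // PaddingSGPRs = 2 and NumPreloadSGPRs = 2, consuming 4 free user SGPRs.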
  // Try to allocate SGPRs to preload implicit kernel arguments.
  void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset,
                                       uint64_t LastExplicitArgOffset,
                                       IRBuilder<> &Builder) {
    Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists(
        F.getParent(), Intrinsic::amdgcn_implicitarg_ptr);
    if (!ImplicitArgPtr)
      return;

    const DataLayout &DL = F.getParent()->getDataLayout();
    // Pair is the load and the load offset.
    SmallVector<std::pair<LoadInst *, unsigned>, 4> ImplicitArgLoads;
    for (auto *U : ImplicitArgPtr->users()) {
      Instruction *CI = dyn_cast<Instruction>(U);
      if (!CI || CI->getParent()->getParent() != &F)
        continue;

      for (auto *U : CI->users()) {
        int64_t Offset = 0;
        auto *Load = dyn_cast<LoadInst>(U); // Load from ImplicitArgPtr?
        if (!Load) {
          if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
            continue;

          Load = dyn_cast<LoadInst>(*U->user_begin()); // Load from GEP?
        }

        if (!Load || !Load->isSimple())
          continue;

        // FIXME: Expand to handle 64-bit implicit args and large merged loads.
        LLVMContext &Ctx = F.getParent()->getContext();
        Type *LoadTy = Load->getType();
        HiddenArg HA = getHiddenArgFromOffset(Offset);
        if (HA == END_HIDDEN_ARGS || LoadTy != getHiddenArgType(Ctx, HA))
          continue;

        ImplicitArgLoads.push_back(std::make_pair(Load, Offset));
      }
    }

    if (ImplicitArgLoads.empty())
      return;
    // Allocate loads in order of offset. We need to be sure that the implicit
    // argument can actually be preloaded.
    std::sort(ImplicitArgLoads.begin(), ImplicitArgLoads.end(), less_second());

    // If we fail to preload any implicit argument we know we don't have SGPRs
    // to preload any subsequent ones with larger offsets. Find the first
    // argument that we cannot preload.
    auto *PreloadEnd = std::find_if(
        ImplicitArgLoads.begin(), ImplicitArgLoads.end(),
        [&](const std::pair<LoadInst *, unsigned> &Load) {
          unsigned LoadSize = DL.getTypeStoreSize(Load.first->getType());
          unsigned LoadOffset = Load.second;
          if (!tryAllocPreloadSGPRs(LoadSize,
                                    LoadOffset + ImplicitArgsBaseOffset,
                                    LastExplicitArgOffset))
            return true;

          LastExplicitArgOffset =
              ImplicitArgsBaseOffset + LoadOffset + LoadSize;
          return false;
        });

    if (PreloadEnd == ImplicitArgLoads.begin())
      return;
    unsigned LastHiddenArgIndex = getHiddenArgFromOffset(PreloadEnd[-1].second);
    Function *NF = cloneFunctionWithPreloadImplicitArgs(LastHiddenArgIndex);
    for (const auto *I = ImplicitArgLoads.begin(); I != PreloadEnd; ++I) {
      LoadInst *Load = I->first;
      unsigned LoadOffset = I->second;
      unsigned HiddenArgIndex = getHiddenArgFromOffset(LoadOffset);
      unsigned Index = NF->arg_size() - LastHiddenArgIndex + HiddenArgIndex - 1;
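      // Index math: the clone appends hidden args 0..LastHiddenArgIndex after
      // the original args, so e.g. with 2 explicit args and preloading through
      // HIDDEN_BLOCK_COUNT_Z (index 2), arg_size() is 5 and the load at
      // offset 4 (HIDDEN_BLOCK_COUNT_Y, index 1) maps to parameter 3.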
      Argument *Arg = NF->getArg(Index);
      Load->replaceAllUsesWith(Arg);
    }
  }
};

class AMDGPULowerKernelArguments : public FunctionPass {
public:
  static char ID;

  AMDGPULowerKernelArguments() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<TargetPassConfig>();
    AU.setPreservesAll();
  }
};
} // end anonymous namespace

// skip allocas
static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
  BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
  for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
    AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);

    // If this is a dynamic alloca, the value may depend on the loaded
    // kernargs, so loads will need to be inserted before it.
    if (!AI || !AI->isStaticAlloca())
      break;
  }

  return InsPt;
}

static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
  CallingConv::ID CC = F.getCallingConv();
  if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
    return false;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
  LLVMContext &Ctx = F.getParent()->getContext();
  const DataLayout &DL = F.getDataLayout();
  BasicBlock &EntryBlock = *F.begin();
  IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));

  const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();

  Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
  if (TotalKernArgSize == 0)
    return false;

  CallInst *KernArgSegment =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
                              nullptr, F.getName() + ".kernarg.segment");
  KernArgSegment->addRetAttr(Attribute::NonNull);
  KernArgSegment->addRetAttr(
      Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
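  // Illustrative result (names assuming a kernel @foo): the segment pointer
  // is materialized once in the entry block as
  //   %foo.kernarg.segment = call ptr addrspace(4)
  //       @llvm.amdgcn.kernarg.segment.ptr()
  // and all argument accesses below become loads at constant offsets from it.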
  uint64_t ExplicitArgOffset = 0;
  // Preloaded kernel arguments must be sequential.
  bool InPreloadSequence = true;
  PreloadKernelArgInfo PreloadInfo(F, ST);

  for (Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign ParamAlign = IsByRef ? Arg.getParamAlign() : std::nullopt;
    Align ABITypeAlign = DL.getValueOrABITypeAlignment(ParamAlign, ArgTy);

    uint64_t Size = DL.getTypeSizeInBits(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);

    uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
    uint64_t LastExplicitArgOffset = ExplicitArgOffset;
    ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
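    // Offset bookkeeping example: for (i8, i32) explicit args, the i8 lands
    // at EltOffset = BaseOffset + 0 and advances ExplicitArgOffset to 1; the
    // i32 then aligns up to 4, so EltOffset = BaseOffset + 4 and
    // ExplicitArgOffset becomes 8.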
    // Guard against the situation where hidden arguments have already been
    // lowered and added to the kernel function signature, i.e. in a situation
    // where this pass has run twice.
    if (Arg.hasAttribute("amdgpu-hidden-argument"))
      break;

    // Try to preload this argument into user SGPRs.
    if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
        !Arg.getType()->isAggregateType())
      if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
                                           LastExplicitArgOffset))
        continue;

    InPreloadSequence = false;

    if (Arg.use_empty())
      continue;
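    // Note: InPreloadSequence goes false on the first argument that cannot be
    // (or is not requested to be) preloaded; every later argument is then
    // lowered to an explicit kernarg load even if it is marked inreg.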
    // If this is byval, the loads are already explicit in the function. We
    // just need to rewrite the pointer values.
    if (IsByRef) {
      Value *ArgOffsetPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".byval.kernarg.offset");

      Value *CastOffsetPtr =
          Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
      Arg.replaceAllUsesWith(CastOffsetPtr);
      continue;
    }
    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
      // can't represent this with range metadata because it's only allowed for
      // integer types.
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
           PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) &&
          !ST.hasUsableDSOffset())
        continue;

      // FIXME: We can replace this with equivalent alias.scope/noalias
      // metadata, but this appears to be a lot of work.
      if (Arg.hasNoAliasAttr())
        continue;
    }
    auto *VT = dyn_cast<FixedVectorType>(ArgTy);
    bool IsV3 = VT && VT->getNumElements() == 3;
    bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();

    VectorType *V4Ty = nullptr;

    int64_t AlignDownOffset = alignDown(EltOffset, 4);
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
    Align AdjustedAlign = commonAlignment(
        KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
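    // Sub-dword example: an i16 at EltOffset 38 gives AlignDownOffset = 36
    // and OffsetDiff = 2, so the lowering below loads an i32 at offset 36 and
    // shifts the value right by 16 bits to recover the argument.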
    Value *ArgPtr;
    Type *AdjustedArgTy;
    if (DoShiftOpt) { // FIXME: Handle aggregate types
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
      // loading earlier than the argument address, and extracting the
      // relevant bits.
      // TODO: Update this for GFX12 which does have scalar sub-dword loads.
      //
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
      // so that CSE between different argument loads works easily.
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
          Arg.getName() + ".kernarg.offset.align.down");
      AdjustedArgTy = Builder.getInt32Ty();
    } else {
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
          Arg.getName() + ".kernarg.offset");
      AdjustedArgTy = ArgTy;
    }
    if (IsV3 && Size >= 32) {
      V4Ty = FixedVectorType::get(VT->getElementType(), 4);
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
      AdjustedArgTy = V4Ty;
    }

    LoadInst *Load =
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));

    MDBuilder MDB(Ctx);
    if (Arg.hasAttribute(Attribute::NoUndef))
      Load->setMetadata(LLVMContext::MD_noundef, MDNode::get(Ctx, {}));

    if (Arg.hasAttribute(Attribute::Range)) {
      const ConstantRange &Range =
          Arg.getAttribute(Attribute::Range).getValueAsConstantRange();
      Load->setMetadata(LLVMContext::MD_range,
                        MDB.createRange(Range.getLower(), Range.getUpper()));
    }
    if (isa<PointerType>(ArgTy)) {
      if (Arg.hasNonNullAttr())
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));

      uint64_t DerefBytes = Arg.getDereferenceableBytes();
      if (DerefBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefBytes))));
      }

      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
      if (DerefOrNullBytes != 0) {
        Load->setMetadata(
            LLVMContext::MD_dereferenceable_or_null,
            MDNode::get(Ctx,
                        MDB.createConstant(ConstantInt::get(
                            Builder.getInt64Ty(), DerefOrNullBytes))));
      }

      if (MaybeAlign ParamAlign = Arg.getParamAlign()) {
        Load->setMetadata(
            LLVMContext::MD_align,
            MDNode::get(Ctx, MDB.createConstant(ConstantInt::get(
                                 Builder.getInt64Ty(), ParamAlign->value()))));
      }
    }
    // TODO: Convert noalias arg to !noalias

    if (DoShiftOpt) {
      Value *ExtractBits = OffsetDiff == 0
                               ? Load
                               : Builder.CreateLShr(Load, OffsetDiff * 8);

      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
      Value *NewVal =
          Builder.CreateBitCast(Trunc, ArgTy, Arg.getName() + ".load");
      Arg.replaceAllUsesWith(NewVal);
    } else if (IsV3) {
      Value *Shuf = Builder.CreateShuffleVector(Load, ArrayRef<int>{0, 1, 2},
                                                Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Shuf);
    } else {
      Load->setName(Arg.getName() + ".load");
      Arg.replaceAllUsesWith(Load);
    }
  }
  KernArgSegment->addRetAttr(
      Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));

  if (InPreloadSequence) {
    uint64_t ImplicitArgsBaseOffset =
        alignTo(ExplicitArgOffset, ST.getAlignmentForImplicitArgPtr()) +
        BaseOffset;
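    // e.g., explicit args ending at byte 20 with an 8-byte implicit-arg
    // alignment place the hidden-argument block at BaseOffset + 24.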
    PreloadInfo.tryAllocImplicitArgPreloadSGPRs(ImplicitArgsBaseOffset,
                                                ExplicitArgOffset, Builder);
  }

  return true;
}
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
  auto &TPC = getAnalysis<TargetPassConfig>();
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
  return lowerKernelArguments(F, TM);
}

INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
                      "AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE,
                    "AMDGPU Lower Kernel Arguments", false, false)
char AMDGPULowerKernelArguments::ID = 0;

FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
  return new AMDGPULowerKernelArguments();
}
PreservedAnalyses
AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = lowerKernelArguments(F, TM);
  if (Changed) {
    // TODO: Preserves a lot more.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }

  return PreservedAnalyses::all();
}