1//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 10/// Implements the AMDGPU specific subclass of TargetSubtarget. 12//===----------------------------------------------------------------------===// 26#include "llvm/IR/IntrinsicsAMDGPU.h" 27#include "llvm/IR/IntrinsicsR600.h" 33#define DEBUG_TYPE "amdgpu-subtarget" 41// Returns the maximum per-workgroup LDS allocation size (in bytes) that still 42// allows the given function to achieve an occupancy of NWaves waves per 43// SIMD / EU, taking into account only the function's *maximum* workgroup size. 49constunsigned WavesPerWorkgroup =
50 std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);
52constunsigned WorkGroupsPerCU =
53 std::max(1u, (NWaves *
getEUsPerCU()) / WavesPerWorkgroup);
58std::pair<unsigned, unsigned>
61// FIXME: We should take into account the LDS allocation granularity. 64// Queried LDS size may be larger than available on a CU, in which case we 65// consider the only achievable occupancy to be 1, in line with what we 66// consider the occupancy to be when the number of requested registers in a 67// particular bank is higher than the number of available ones in that bank. 73auto PropsFromWGSize = [=](
unsigned WGSize)
74 -> std::tuple<const unsigned, const unsigned, unsigned> {
75unsigned WavesPerWG =
divideCeil(WGSize, WaveSize);
77return {WavesPerWG, WGsPerCU, WavesPerWG * WGsPerCU};
80// The maximum group size will generally yield the minimum number of 81// workgroups, maximum number of waves, and minimum occupancy. The opposite is 82// generally true for the minimum group size. LDS or barrier resource 83// limitations can flip those minimums/maximums. 85auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
86auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
88// It is possible that we end up with flipped minimum and maximum number of 89// waves per CU when the number of minimum/maximum concurrent groups on the CU 90// is limited by LDS usage or barrier resources. 91if (MinWavesPerCU >= MaxWavesPerCU) {
94constunsigned WaveSlotsPerCU = WavesPerEU *
getEUsPerCU();
96// Look for a potential smaller group size than the maximum which decreases 97// the concurrent number of waves on the CU for the same number of 98// concurrent workgroups on the CU. 99unsigned MinWavesPerCUForWGSize =
100divideCeil(WaveSlotsPerCU, MinWGsPerCU + 1) * MinWGsPerCU;
101if (MinWavesPerCU > MinWavesPerCUForWGSize) {
102unsigned ExcessSlots = MinWavesPerCU - MinWavesPerCUForWGSize;
103if (
unsigned ExcessSlotsPerWG = ExcessSlots / MinWGsPerCU) {
104// There may exist a smaller group size than the maximum that achieves 105// the minimum number of waves per CU. This group size is the largest 106// possible size that requires MaxWavesPerWG - E waves where E is 107// maximized under the following constraints. 108// 1. 0 <= E <= ExcessSlotsPerWG 109// 2. (MaxWavesPerWG - E) * WaveSize >= MinWGSize 110 MinWavesPerCU -= MinWGsPerCU * std::min(ExcessSlotsPerWG,
111 MaxWavesPerWG - MinWavesPerWG);
115// Look for a potential larger group size than the minimum which increases 116// the concurrent number of waves on the CU for the same number of 117// concurrent workgroups on the CU. 118unsigned LeftoverSlots = WaveSlotsPerCU - MaxWGsPerCU * MinWavesPerWG;
119if (
unsigned LeftoverSlotsPerWG = LeftoverSlots / MaxWGsPerCU) {
120// There may exist a larger group size than the minimum that achieves the 121// maximum number of waves per CU. This group size is the smallest 122// possible size that requires MinWavesPerWG + L waves where L is 123// maximized under the following constraints. 124// 1. 0 <= L <= LeftoverSlotsPerWG 125// 2. (MinWavesPerWG + L - 1) * WaveSize <= MaxWGSize 126 MaxWavesPerCU += MaxWGsPerCU * std::min(LeftoverSlotsPerWG,
127 ((MaxWGSize - 1) / WaveSize) + 1 -
132// Return the minimum/maximum number of waves on any EU, assuming that all 133// wavefronts are spread across all EUs as evenly as possible. 134return {std::clamp(MinWavesPerCU /
getEUsPerCU(), 1U, WavesPerEU),
144std::pair<unsigned, unsigned>
161// Default minimum/maximum flat work group sizes. 162 std::pair<unsigned, unsigned>
Default =
165// Requested minimum/maximum flat work group sizes. 167F,
"amdgpu-flat-work-group-size",
Default);
169// Make sure requested minimum is less than requested maximum. 170if (Requested.first > Requested.second)
173// Make sure requested values do not violate subtarget's specifications. 183 std::pair<unsigned, unsigned> Requested,
184 std::pair<unsigned, unsigned> FlatWorkGroupSizes)
const{
185// Default minimum/maximum number of waves per execution unit. 188// If minimum/maximum flat work group sizes were explicitly requested using 189// "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum 190// number of waves per execution unit to values implied by requested 191// minimum/maximum flat work group sizes. 192unsigned MinImpliedByFlatWorkGroupSize =
194Default.first = MinImpliedByFlatWorkGroupSize;
196// Make sure requested minimum is less than requested maximum. 197if (Requested.second && Requested.first > Requested.second)
200// Make sure requested values do not violate subtarget's specifications. 205// Make sure requested values are compatible with values implied by requested 206// minimum/maximum flat work group sizes. 207if (Requested.first < MinImpliedByFlatWorkGroupSize)
214constFunction &
F, std::pair<unsigned, unsigned> FlatWorkGroupSizes)
const{
215// Default minimum/maximum number of waves per execution unit. 218// Requested minimum/maximum number of waves per execution unit. 219 std::pair<unsigned, unsigned> Requested =
226if (
Node &&
Node->getNumOperands() == 3)
227return mdconst::extract<ConstantInt>(
Node->getOperand(Dim))->getZExtValue();
228return std::numeric_limits<unsigned>::max();
236unsigned Dimension)
const{
238if (ReqdSize != std::numeric_limits<unsigned>::max())
244for (
intI = 0;
I < 3; ++
I) {
253Function *Kernel =
I->getParent()->getParent();
258// If reqd_work_group_size is present it narrows value down. 259if (
auto *CI = dyn_cast<CallInst>(
I)) {
260constFunction *
F = CI->getCalledFunction();
262unsigned Dim = UINT_MAX;
263switch (
F->getIntrinsicID()) {
264case Intrinsic::amdgcn_workitem_id_x:
265case Intrinsic::r600_read_tidig_x:
268case Intrinsic::r600_read_local_size_x:
271case Intrinsic::amdgcn_workitem_id_y:
272case Intrinsic::r600_read_tidig_y:
275case Intrinsic::r600_read_local_size_y:
278case Intrinsic::amdgcn_workitem_id_z:
279case Intrinsic::r600_read_tidig_z:
282case Intrinsic::r600_read_local_size_z:
291if (ReqdSize != std::numeric_limits<unsigned>::max())
292 MinSize = MaxSize = ReqdSize;
300// Range metadata is [Lo, Hi). For ID query we need to pass max size 301// as Hi. For size query we need to pass Hi + 1. 309if (
auto *CI = dyn_cast<CallBase>(
I)) {
311 CI->addRangeRetAttr(
Range);
315I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
323// We don't allocate the segment if we know the implicit arguments weren't 324// used, even if the ABI implies we need them. 325if (
F.hasFnAttribute(
"amdgpu-no-implicitarg-ptr"))
331// Assume all implicit inputs are used by default 335returnF.getFnAttributeAsParsedInteger(
"amdgpu-implicitarg-num-bytes",
340Align &MaxAlign)
const{
349if (Arg.hasAttribute(
"amdgpu-hidden-argument"))
352constbool IsByRef = Arg.hasByRefAttr();
353Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
354Align Alignment =
DL.getValueOrABITypeAlignment(
355 IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
356uint64_t AllocSize =
DL.getTypeAllocSize(ArgTy);
357 ExplicitArgBytes =
alignTo(ExplicitArgBytes, Alignment) + AllocSize;
358 MaxAlign = std::max(MaxAlign, Alignment);
361return ExplicitArgBytes;
365Align &MaxAlign)
const{
374uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
376if (ImplicitBytes != 0) {
378 TotalSize =
alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
379 MaxAlign = std::max(MaxAlign, Alignment);
382// Being able to dereference past the end is useful for emitting scalar loads. 404// FIXME: This has no reason to be in subtarget 408 std::numeric_limits<uint32_t>::max());
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim)
Base class for AMDGPU specific classes of TargetSubtarget.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file describes how to lower LLVM inline asm to machine code INLINEASM.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
AMDGPU R600 specific subclass of TargetSubtarget.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
std::pair< unsigned, unsigned > getDefaultFlatWorkGroupSize(CallingConv::ID CC) const
bool EnableRealTrue16Insts
Align getAlignmentForImplicitArgPtr() const
unsigned getEUsPerCU() const
Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the "CU" is the unit onto whic...
bool isMesaKernel(const Function &F) const
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
virtual unsigned getMinWavesPerEU() const =0
std::pair< unsigned, unsigned > getFlatWorkGroupSizes(const Function &F) const
bool makeLIDRangeMetadata(Instruction *I) const
Creates value range metadata on an workitemid.* intrinsic call or load.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getImplicitArgNumBytes(const Function &F) const
unsigned getLocalMemorySize() const
Return the maximum number of bytes of LDS available for all workgroups running on the same WGP or CU.
SmallVector< unsigned > getMaxNumWorkGroups(const Function &F) const
Return the number of work groups for the function.
virtual unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const =0
virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const =0
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
virtual unsigned getMaxFlatWorkGroupSize() const =0
AMDGPUSubtarget(Triple TT)
unsigned getExplicitKernelArgOffset() const
Returns the offset in bytes from the start of the input buffer of the first explicit kernel argument.
unsigned getMaxWavesPerEU() const
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const
std::pair< unsigned, unsigned > getEffectiveWavesPerEU(std::pair< unsigned, unsigned > WavesPerEU, std::pair< unsigned, unsigned > FlatWorkGroupSizes) const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
unsigned getWavefrontSize() const
virtual unsigned getMinFlatWorkGroupSize() const =0
Class for arbitrary precision integers.
This class represents an incoming formal argument to a Function.
This class represents a range of values.
A parsed version of the target data layout string in and methods for querying it.
MDNode * getMetadata(unsigned KindID) const
Get the current metadata attachments for the given kind, if any.
MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
A Module instance is used to store all the information related to an LLVM module.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
Triple - Helper class for working with autoconf configuration names.
ArchType getArch() const
Get the parsed architecture type of this triple.
The instances of the Type class are immutable: once they are created, they are never changed.
LLVM_READNONE bool isKernel(CallingConv::ID CC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isShader(CallingConv::ID cc)
SmallVector< unsigned > getIntegerVecAttribute(const Function &F, StringRef Name, unsigned Size, unsigned DefaultVal)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
This is an optimization pass for GlobalISel generic memory operations.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
@ Default
The result values are uniform if and only if all operands are uniform.
Implement std::hash so that hash_code can be used in STL containers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.