//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-lower"

    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
             " 1: do it  2: do it aggressively"),
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
/// does NOT use lg2.approx for log2, so this is disabled by default.
    "nvptx-approx-log2f32",
    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
105"nvptx-force-min-byval-param-align",
cl::Hidden,
106cl::desc(
"NVPTX Specific: force 4-byte minimal alignment for byval" 107" params of device functions."),
  // If nvptx-prec-div32=N is used on the command-line, always honor it
  // Otherwise, use div.approx if fast math is enabled

  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
  // Otherwise, use sqrt.approx if fast math is enabled

  case MVT::v8i8:   // <2 x i8x4>
  case MVT::v16i8:  // <4 x i8x4>
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>

// When legalizing vector loads/stores, this function is called, which does two
// things:
// 1. Determines whether the vector is something we want to custom lower,
//    std::nullopt is returned if we do not want to custom lower it.
// 2. If we do want to handle it, returns two parameters:
//    - unsigned int NumElts - The number of elements in the final vector
//    - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, EVT>>
187// We only handle "native" vector sizes for now, e.g. <4 x double> is not 188// legal. We can (and should) split that into 2 stores of <2 x double> here 189// but I'm leaving that as a TODO for now. 207// This is a "native" vector type 208return std::pair(NumElts, EltVT);
209case MVT::v8i8:
// <2 x i8x4> 210case MVT::v8f16:
// <4 x f16x2> 211case MVT::v8bf16:
// <4 x bf16x2> 212case MVT::v8i16:
// <4 x i16x2> 213case MVT::v16i8:
// <4 x i8x4> 214// This can be upsized into a "native" vector type. 215// Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for 216// total load/store size, PTX syntax only supports v2/v4. Thus, we can't use 217// vectorized loads/stores with the actual element type for i8/i16 as that 218// would require v8/v16 variants that do not exist. 219// In order to load/store such vectors efficiently, here in Type 220// Legalization, we split the vector into word-sized chunks (v2x16/v4i8). 221// Later, we will lower to PTX as vectors of b32. 223// Number of elements to pack in one word. 226return std::pair(NumElts / NPerWord,
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
    // Special case for i128 - decompose to (i64, i64)
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    for (auto *EI : STy->elements()) {
                         StartingOffset + SL->getElementOffset(ElementNum));

  // Given an array type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Type *EltTy = ATy->getElementType();
    for (int I : llvm::seq<int>(ATy->getNumElements()))

  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
      // We require power-of-2 sized vectors because
      // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
      // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
      // vectors.

      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
        // v*i8 are formally lowered as v4i8
        NumElts = (NumElts + 3) / 4;
      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
        // v2i8 is promoted to v2i16
      for (unsigned j = 0; j != NumElts; ++j) {
        Offsets->push_back(Off);
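  // For illustration: with the rules above, an <8 x i8> piece is reported as
  // (8 + 3) / 4 = 2 parts of MVT::v4i8 rather than as 8 individual i8 parts.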
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromoteVT if the function returns true.
           "Promotion is not suitable for scalars of size larger than 64-bits");
      *PromotedVT = MVT::i1;
      *PromotedVT = MVT::i8;
      *PromotedVT = MVT::i16;
      *PromotedVT = MVT::i32;
      *PromotedVT = MVT::i64;
  return EVT(*PromotedVT) != VT;
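// For illustration: an i24 scalar is widened to MVT::i32 and the function
// reports a promotion by returning true, while an i32 already matches one of
// the sizes above, so no promotion is reported.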
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
  // Element is too large to vectorize.
  if (EltSize >= AccessSize)

  unsigned NumElts = AccessSize / EltSize;
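  // For illustration: four f32 pieces at offsets 0, 4, 8 and 12 with
  // ParamAlignment >= 16 give NumElts = 16 / 4 = 4 for a 16-byte access;
  // if the element types match and the offsets are contiguous (checked
  // below), all four pieces can be covered by one vectorized op.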
  // Can't vectorize if AccessBytes is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)

  // OK. We can vectorize ValueVTs[i..i+NumElts)

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
  // Scalar is effectively a 1-element vector.

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
                          Align ParamAlignment,
bool IsVAArg =
false) {
446// Set vector size to match ValueVTs and mark all elements as 447// scalars by default. 454// Check what we can vectorize using 128/64/32-bit accesses. 455for (
intI = 0, E = ValueVTs.
size();
I != E; ++
I) {
456// Skip elements we've already processed. 458for (
unsigned AccessSize : {16, 8, 4, 2}) {
460I, AccessSize, ValueVTs, Offsets, ParamAlignment);
461// Mark vectorized elements. 466// Can't vectorize using this size, try next smaller size. 469assert(
I + 1 < E &&
"Not enough elements.");
475assert(
I + 3 < E &&
"Not enough elements.");
483// Break out of the inner loop because we've already succeeded 484// using largest possible AccessSize. 493if (
Value->getValueType(0) == VT)
// NVPTXTargetLowering Constructor.
  // always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather
  // than generating calls to memset, memcpy or memmove.
  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  // By default, use the Source scheduling
  // Several FP16 instructions are available on sm_80 only.
      Op, VT, IsOpSupported ? Action : NoBF16Action);
555bool IsOpSupported =
false;
556// instructions are available on sm_90 only 582// Conversion to/from FP16/FP16x2 is always legal. 595// Conversion to/from BFP16/BFP16x2 is always legal. 606// Conversion to/from i16/i16x2 is always legal. 617// Custom conversions to/from v2i8. 620// Only logical ops can be done on v4i8 directly, others must be done 640// Operations not directly supported by NVPTX. 641for (
MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
642 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
643 MVT::i32, MVT::i64}) {
648// Some SIGN_EXTEND_INREG can be done using cvt instruction. 649// For others we will expand to a SHL/SRA pair. 668 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
682// We want to legalize constant related memmove and memcopy 686// Turn FP extload into load/fpextend 706// Turn FP truncstore into trunc + store. 707// FIXME: vector types should also be expanded 714// PTX does not support load / store predicate registers 730// expand extload of vector of integers. 735// This is legal in NVPTX 744// TRAP can be lowered to PTX trap 746// DEBUGTRAP can be lowered to PTX brkpt 749// Register custom handling for vector loads/stores 764// Custom handling for i8 intrinsics 767for (
constauto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
793// Other arithmetic and logic ops are unsupported. 815// PTX does not directly support SELP of i1, so promote to i32 first 818// PTX cannot multiply two i64s in a single instruction. 822// We have some custom DAG combine patterns for these nodes 827// setcc for f16x2 and bf16x2 needs special handling to prevent 828// legalizer's attempt to scalarize it due to v2i1 not being legal. 832// Promote fp16 arithmetic if fp16 hardware isn't available or the 833// user passed --nvptx-no-fp16-math. The flag is useful because, 834// although sm_53+ GPUs have some sort of FP16 support in 835// hardware, only sm_53 and sm_60 have full implementation. Others 836// only have token amount of hardware and are likely to run faster 837// by using fp32 units instead. 842// bf16 must be promoted to f32. 848// On SM80, we select add/mul/sub as fma to avoid promotion to float 850for (
constauto &VT : {MVT::bf16, MVT::v2bf16}) {
857// f16/f16x2 neg was introduced in PTX 60, SM_53. 858constbool IsFP16FP16x2NegAvailable = STI.
getSmVersion() >= 53 &&
861for (
constauto &VT : {MVT::f16, MVT::v2f16})
867// (would be) Library functions. 869// These map to conversion instructions for scalar FP types. 886for (
MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
892// sm_80 only has conversions between f32 and bf16. Custom lower all other 895for (
MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
913// 'Expand' implements FCOPYSIGN without calling an external library. 921// These map to corresponding instructions for f32/f64. f16 must be 922// promoted to f32. v2f16 is expanded to f16, which is then promoted 958bool SupportsF32MinMaxNaN =
968// Custom lowering for inline asm with 128-bit operands 974// - f16/f16x2 (sm_70+, PTX 7.0+) 975// - bf16/bf16x2 (sm_90+, PTX 7.8+) 976// When f16/bf16 types aren't supported, they are promoted/expanded to f32. 983// FLOG2 supports f32 only 984// f16/bf16 types aren't supported, but they are promoted/expanded to f32. 992// No FPOW or FREM in PTX. 994// Now deduce the information based on the above mentioned 1005#define MAKE_CASE(V) \ 1092bool Reciprocal)
const{
  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // sqrt(x).)
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
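      // For illustration, the f64 path above corresponds roughly to the PTX
      // sequence:
      //   rsqrt.approx.f64    %t, %x;
      //   rcp.approx.ftz.f64  %r, %t;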
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  assert(isABI && "Non-ABI compilation is not supported");
  std::string Prototype;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";
1175if (
auto *ITy = dyn_cast<IntegerType>(retTy)) {
1176size = ITy->getBitWidth();
1179"Floating point type expected here");
1182// PTX ABI requires all scalar return values to be at least 32 1183// bits in size. fp16 normally uses .b16 as its storage type in 1184// PTX, so its size must be adjusted here, too. 1187 O <<
".param .b" <<
size <<
" _";
1188 }
elseif (isa<PointerType>(retTy)) {
1189 O <<
".param .b" << PtrVT.getSizeInBits() <<
" _";
1191 O <<
".param .align " << (retAlignment ? retAlignment->value() : 0)
1192 <<
" .b8 _[" <<
DL.getTypeAllocSize(retTy) <<
"]";
1202unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1203for (
unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1204Type *Ty = Args[i].Ty;
1210if (!Outs[OIdx].Flags.isByVal()) {
1214 O <<
".param .align " << ParamAlign.
value() <<
" .b8 ";
1216 O <<
"[" <<
DL.getTypeAllocSize(Ty) <<
"]";
1217// update the index for Outs 1220if (
unsigned len = vtparts.
size())
1224// i8 types in IR will be i16 types in SDAG 1226 (
getValueType(
DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1227"type mismatch between callee prototype and arguments");
1230if (isa<IntegerType>(Ty)) {
1231 sz = cast<IntegerType>(Ty)->getBitWidth();
1233 }
elseif (isa<PointerType>(Ty)) {
1234 sz = PtrVT.getSizeInBits();
1238 O <<
".param .b" << sz <<
" ";
1243// Indirect calls need strict ABI alignment so we disable optimizations by 1244// not providing a function to optimize. 1245Type *ETy = Args[i].IndirectType;
1246Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1247Align ParamByValAlign =
1250 O <<
".param .align " << ParamByValAlign.
value() <<
" .b8 ";
1252 O <<
"[" << Outs[OIdx].Flags.getByValSize() <<
"]";
1256 O << (first ?
"" :
",") <<
" .param .align " << VAInfo->second
1275// CallSite is zero, fallback to ABI type alignment 1276returnDL.getABITypeAlign(Ty);
1282// We don't have a direct function symbol, but that may be because of 1283// constant cast instructions in the call. 1285// With bitcast'd call targets, the instruction will be the call 1286if (
constauto *CI = dyn_cast<CallInst>(CB)) {
1287// Check if we have call alignment metadata 1289return StackAlign.value();
1294// Check for function alignment information if we found that the 1295// ultimate target is a Function 1299// Call is indirect, fall back to the ABI type alignment 1300returnDL.getABITypeAlign(Ty);
1304switch (ElementType.getSimpleVT().SimpleTy) {
1309 ElementType = MVT::i16;
1314 ElementType = MVT::i32;
1317 ElementType = MVT::i64;
1322// Use byte-store when the param address of the argument value is unaligned. 1323// This may happen when the return value is a field of a packed structure. 1325// This is called in LowerCall() when passing the param values. 1329unsigned ArgID,
constSDLoc &dl) {
1330// Bit logic only works on integer types 1336for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    // Trunc store only the last byte by using
    // The register type can be larger than b8.

// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
  // Bit logic only works on integer types
  EVT MergedType = ElementType;
1365// Load each byte and construct the whole value. Initial value to 0 1367// LoadParamMemI8 loads into i16 register only 1369for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1373// This will be selected to LoadParamMemI8 1387// Need to extend the i16 register to the whole width. 1389// Mask off the high bits. Leave only the lower 8bits. 1390// Do this because we are using loadparam.b8. 1396if (ElementType != MergedType)
1406if (
auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1416"Support for variadic functions (unsized array parameter) introduced " 1417"in PTX ISA version 6.0 and requires target sm_30.");
1433assert(isABI &&
"Non-ABI compilation is not supported");
  // Variadic arguments.
  //
  // Normally, for each argument, we declare a param scalar or a param
  // byte array in the .param space, and store the argument value to that
  // param scalar or array starting at offset 0.
  //
  // In the case of the first variadic argument, we declare a vararg byte array
  // with size 0. The exact size of this array isn't known at this point, so
  // it'll be patched later. All the variadic arguments will be stored to this
  // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
  // initially set to 0, so it can be used for non-variadic arguments (which use
  // 0 offset) to simplify the code.
  //
  // After all vararg is processed, 'VAOffset' holds the size of the
  // vararg byte array.
  SDValue VADeclareParam;                 // vararg byte array
  unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
  unsigned VAOffset = 0;                  // current offset in the param array

  unsigned ParamCount = 0;
1463// Args.size() and Outs.size() need not match. 1464// Outs.size() will be larger 1465// * if there is an aggregate argument with multiple fields (each field 1466// showing up separately in Outs) 1467// * if there is a vector argument with more than typical vector-length 1468// elements (generally if more than 4) where each vector element is 1469// individually present in Outs. 1470// So a different index should be used for indexing into Outs/OutVals. 1471// See similar issue in LowerFormalArguments. 1473// Declare the .params or .reg need to pass values 1475for (
unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1476EVT VT = Outs[OIdx].VT;
1477Type *Ty = Args[i].Ty;
1479bool IsByVal = Outs[OIdx].Flags.isByVal();
1484assert((!IsByVal || Args[i].IndirectType) &&
1485"byval arg must have indirect type");
1486Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1491// The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1492// so we don't need to worry whether it's naturally aligned or not. 1493// See TargetLowering::LowerCallTo(). 1494Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1498 VAOffset =
alignTo(VAOffset, ArgAlign);
1500 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1,
DL);
1504 (IsByVal ? Outs[OIdx].Flags.getByValSize() :
DL.getTypeAllocSize(Ty));
1507bool NeedAlign;
// Does argument declaration specify alignment? 1510if (ParamCount == FirstVAArg) {
1516 DeclareParamVTs, DeclareParamOps);
1518 NeedAlign = PassAsArray;
1519 }
elseif (PassAsArray) {
1520// declare .param .align <align> .b8 .param<n>[<size>]; 1529// declare .param .b<size> .param<n>; 1531// PTX ABI requires integral types to be at least 32 bits in 1532// size. FP16 is loaded/stored using i16, so it's handled 1536SDValue DeclareScalarParamOps[] = {
1541 DeclareScalarParamOps);
1546// PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1547// than 32-bits are sign extended or zero extended, depending on 1548// whether they are signed or unsigned types. This case applies 1549// only to scalar parameters and not to aggregate values. 1550bool ExtendIntegerParam =
1555for (
unsigned j = 0, je = VTs.
size(); j != je; ++j) {
1557int CurOffset = Offsets[j];
1566 EltVT =
EVT(PromotedVT);
1571 StVal = DAG.
getNode(Ext, dl, PromotedVT, StVal);
1580 }
elseif (ExtendIntegerParam) {
1581assert(VTs.
size() == 1 &&
"Scalar can't have multiple parts.");
1585 dl, MVT::i32, StVal);
1589// Use 16-bit registers for small stores as it's the 1590// smallest general purpose register size supported by NVPTX. 1594// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 1595// scalar store. In such cases, fall back to byte stores. 1596if (VectorInfo[j] ==
PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1599assert(StoreOperands.
empty() &&
"Unfinished preceeding store.");
1601 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1602 StVal, InGlue, ParamCount, dl);
1604// LowerUnalignedStoreParam took care of inserting the necessary nodes 1605// into the SDAG, so just move on to the next element. 1613assert(StoreOperands.
empty() &&
"Unfinished preceding store.");
1616 DAG.
getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1619 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1623// Record the value to store. 1627unsigned NumElts = StoreOperands.
size() - 3;
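        // The three operands recorded before the values are the chain, the
        // parameter index, and the byte offset, hence the "- 3" above.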
1645// Adjust type of the store op if we've extended the scalar 1647EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1650Op, dl, DAG.
getVTList(MVT::Other, MVT::Glue), StoreOperands,
1656 StoreOperands.
clear();
1658// TODO: We may need to support vector types that can be passed 1659// as scalars in variadic arguments. 1660if (!IsByVal && IsVAArg) {
1662"Vectorization is expected to be disabled for variadics.");
1663 VAOffset +=
DL.getTypeAllocSize(
1670assert(StoreOperands.
empty() &&
"Unfinished parameter store.");
1671if (!IsByVal && VTs.
size() > 0)
1674if (IsByVal && IsVAArg)
1682if (Ins.size() > 0) {
1687// .param .align N .b8 retval0[<size-in-bytes>], or 1688// .param .b<size-in-bits> retval0 1689unsigned resultsz =
DL.getTypeAllocSizeInBits(
RetTy);
1700 retAlignment = getArgumentAlignment(CB,
RetTy, 0,
DL);
1701assert(retAlignment &&
"retAlignment is guaranteed to be set");
1704 Chain, DAG.
getConstant(retAlignment->value(), dl, MVT::i32),
1714// Set the size of the vararg param byte array if the callee is a variadic 1715// function and the variadic part is not empty. 1722 VADeclareParam->
getVTList(), DeclareParamOps);
1725// If the type of the callsite does not match that of the function, convert 1726// the callsite to an indirect call. 1729// Both indirect calls and libcalls have nullptr Func. In order to distinguish 1730// between them we must rely on the call site value which is valid for 1731// indirect calls but is always null for libcalls. 1734if (isa<ExternalSymbolSDNode>(Callee)) {
1737// Try to find the callee in the current module. 1739assert(CalleeFunc !=
nullptr &&
"Libcall callee must be set.");
1741// Set the "libcall callee" attribute to indicate that the function 1742// must always have a declaration. 1743 CalleeFunc->
addFnAttr(
"nvptx-libcall-callee",
"true");
    // This is indirect function call case : PTX requires a prototype of the
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of call
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
        DL,
RetTy, Args, Outs, retAlignment,
1758 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1761 *CB, UniqueCallSite);
1771// Op to just print "call" 1774 Chain, DAG.
getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1776// We model convergent calls as separate opcodes. 1781 Chain = DAG.
getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1784if (ConvertToIndirectCall) {
1785// Copy the function ptr to a ptx register and use the register to call the 1787EVT DestVT = Callee.getValueType();
1796// Ops to print out the function name 1798SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1802// Ops to print out the param list 1804SDValue CallArgBeginOps[] = { Chain, InGlue };
1809for (
unsigned i = 0, e = std::min(CLI.
NumFixedArgs + 1, ParamCount); i != e;
1819 Chain = DAG.
getNode(opcode, dl, CallArgVTs, CallArgOps);
1823SDValue CallArgEndOps[] = { Chain,
1832 Chain, DAG.
getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
  // An item of the vector is filled if the element does not need a ProxyReg
  // operation on it and should be added to InVals as is. ProxyRegOps and
  // ProxyRegTruncates contain empty/none items at the same index.

  // A temporary ProxyReg operation is inserted in `LowerUnalignedLoadRetParam()`
  // to use the values of `LoadParam`s and to be replaced later when
  // `CALLSEQ_END` is added.

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
1853assert(VTs.
size() == Ins.size() &&
"Bad value decomposition");
1859int VecIdx = -1;
// Index of the first element of the vector. 1861// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1862// 32-bits are sign extended or zero extended, depending on whether 1863// they are signed or unsigned types. 1864bool ExtendIntegerRetVal =
1865RetTy->isIntegerTy() &&
DL.getTypeAllocSizeInBits(
RetTy) < 32;
1867for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
1868bool needTruncate =
false;
1869EVT TheLoadType = VTs[i];
1870EVT EltType = Ins[i].VT;
1875 TheLoadType =
EVT(PromotedVT);
1876 EltType =
EVT(PromotedVT);
1880if (ExtendIntegerRetVal) {
1881 TheLoadType = MVT::i32;
1885if (VTs[i].isInteger())
1890// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 1891// scalar load. In such cases, fall back to byte loads. 1893 EltAlign <
DL.getABITypeAlign(
1895assert(VecIdx == -1 && LoadVTs.
empty() &&
"Orphaned operand list.");
1897 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
1899 ProxyRegTruncates.
push_back(std::optional<MVT>());
1906// Record index of the very first element of the vector. 1908assert(VecIdx == -1 && LoadVTs.
empty() &&
"Orphaned operand list.");
1915unsigned NumElts = LoadVTs.
size();
1935 DAG.
getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
1937Op, dl, DAG.
getVTList(LoadVTs), LoadOperands, TheLoadType,
1941for (
unsigned j = 0; j < NumElts; ++j) {
1945 ProxyRegTruncates.
push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1947 ProxyRegTruncates.
push_back(std::optional<MVT>());
1951 InGlue = RetVal.
getValue(NumElts + 1);
1961 DAG.
getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
  // dangling.
  for (
unsigned i = 0; i < ProxyRegOps.
size(); ++i) {
1968if (i < RetElts.
size() && RetElts[i]) {
1975 DAG.
getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1976 { Chain, ProxyRegOps[i], InGlue }
1979 Chain = Ret.getValue(1);
1980 InGlue = Ret.getValue(2);
1982if (ProxyRegTruncates[i]) {
1989for (
SDValue &
T : TempProxyRegOps) {
1992 DAG.
getVTList(
T.getSimpleValueType(), MVT::Other, MVT::Glue),
1993 {Chain, T.getOperand(0), InGlue});
2001// set isTailCall to false for now, until we figure out how to express 2002// tail call optimization in PTX 2015"Support for dynamic alloca introduced in PTX ISA version 7.3 and " 2016"requires target sm_52.",
2026uint64_tAlign = cast<ConstantSDNode>(
Op.getOperand(2))->getZExtValue();
2029// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. 2034EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2046"Support for stackrestore requires PTX ISA version >= 7.3 and target " 2050returnOp.getOperand(0);
2069"Support for stacksave requires PTX ISA version >= 7.3 and target >= " 2086// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2087// (see LegalizeDAG.cpp). This is slow and uses local memory. 2088// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2094unsigned NumOperands = Node->getNumOperands();
2095for (
unsigned i = 0; i < NumOperands; ++i) {
2096SDValue SubOp = Node->getOperand(i);
2100for (
unsigned j = 0; j < NumSubElem; ++j) {
2109// Handle bitcasting from v2i8 without hitting the default promotion 2110// strategy which goes through stack memory. 2111EVT FromVT =
Op->getOperand(0)->getValueType(0);
2112if (FromVT != MVT::v2i8) {
2116// Pack vector elements into i16 and bitcast to final type 2128EVT ToVT =
Op->getValueType(0);
2132// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it 2133// would get lowered as two constant loads and vector-packing move. 2134// Instead we want just a constant move: 2135// mov.b32 %r2, 0x40003C00 2138EVT VT =
Op->getValueType(0);
2144 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2145 isa<ConstantFPSDNode>(Operand);
2149// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2150// to optimize calculation of constant parts. 2164auto PRMT__10 = GetPRMT(
Op->getOperand(0),
Op->getOperand(1),
true, 0x3340);
2165auto PRMT__32 = GetPRMT(
Op->getOperand(2),
Op->getOperand(3),
true, 0x3340);
2166auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32,
false, 0x5410);
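  // For illustration of the selectors above: in PTX's prmt.b32, each selector
  // nibble picks one byte out of the eight bytes formed by the two source
  // registers (0-3 from the first, 4-7 from the second). 0x3340 therefore
  // places the low byte of each of the two operands into the two low bytes of
  // the intermediate result, and 0x5410 merges bytes {0,1} of PRMT__10 with
  // bytes {0,1} of PRMT__32, packing all four i8 values into a single b32.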
2170// Get value or the Nth operand as an APInt(32). Undef values treated as 0. 2173EVT VT =
Op->getValueType(0);
2177if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2178Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2179elseif (VT == MVT::v2i16 || VT == MVT::v4i8)
2183// i8 values are carried around as i16, so we need to zero out upper bits, 2184// so they do not get in the way of combining individual byte values 2187returnValue.zext(32);
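  // For illustration: a constant v2f16 of {1.0, 2.0} packs (low element in the
  // low 16 bits) to 0x3C00 | 0x4000 << 16 = 0x40003C00, matching the mov.b32
  // example above.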
2191Value = GetOperand(
Op, 0) | GetOperand(
Op, 1).shl(16);
2192 }
elseif (VT == MVT::v4i8) {
2193Value = GetOperand(
Op, 0) | GetOperand(
Op, 1).shl(8) |
2194 GetOperand(
Op, 2).shl(16) | GetOperand(
Op, 3).shl(24);
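  // For illustration: a constant v4i8 of <1, 2, 3, 4> packs to
  // 0x01 | 0x02 << 8 | 0x03 << 16 | 0x04 << 24 = 0x04030201.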
2209if (VectorVT == MVT::v4i8) {
2220// Constant index will be matched by tablegen. 2221if (isa<ConstantSDNode>(
Index.getNode()))
2224// Extract individual elements and select one of them. 2242if (VectorVT != MVT::v4i8)
2246if (
Value->isUndef())
2265if (VectorVT != MVT::v4i8 ||
Op.getValueType() != MVT::v4i8)
2268// Lower shuffle to PRMT instruction. 2273if (
I.value() != -1)
// -1 is a placeholder for undef. 2274 Selector |= (
I.value() << (
I.index() * 4));
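  // For illustration: PRMT treats the two v4i8 operands as an 8-byte pool
  // (bytes 0-3 from the first operand, 4-7 from the second), so a shuffle
  // mask of <0, 4, 1, 5> becomes Selector = 0x5140 and interleaves the low
  // halves of the two operands.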
2282/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2283/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2285/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2292EVT VT =
Op.getValueType();
2301// For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2302// {dHi, dLo} = {aHi, aLo} >> Amt 2304// dLo = shf.r.clamp aLo, aHi, Amt 2314// {dHi, dLo} = {aHi, aLo} >> Amt 2315// - if (Amt>=size) then 2316// dLo = aHi >> (Amt-size) 2317// dHi = aHi >> Amt (this is either all 0 or all 1) 2319// dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2343/// LowerShiftLeftParts - Lower SHL_PARTS, which 2344/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2346/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2353EVT VT =
Op.getValueType();
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    //   {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    // - else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
/// If the types match, convert the generic copysign to the NVPTXISD version,
/// otherwise bail ensuring that mismatched cases are properly expanded.
  EVT VT =
Op.getValueType();
2421EVT VT =
Op.getValueType();
2424return LowerFROUND32(
Op, DAG);
2427return LowerFROUND64(
Op, DAG);
// This is the rounding method used in CUDA libdevice in C like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
  EVT VT =
Op.getValueType();
2447// RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2449constunsigned SignBitMask = 0x80000000;
2452constunsigned PointFiveInBits = 0x3F000000;
2461// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2468// return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2475// The implementation of round(double) is similar to that of round(float) in 2476// that they both separate the value range into three regions and use a method 2477// specific to the region to round the values. However, round(double) first 2478// calculates the round of the absolute value and then adds the sign back while 2479// round(float) directly rounds the value with sign. 2484EVT VT =
Op.getValueType();
2488// double RoundedA = (double) (int) (abs(A) + 0.5f); 2493// RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2501// Add sign to rounded_A 2505// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2513EVT VT =
N->getValueType(0);
2537if (
Op.getValueType() == MVT::bf16) {
2541 DAG.
getNode(
Op.getOpcode(), Loc, MVT::f32,
Op.getOperand(0)),
2545// Everything else is considered legal. 2553if (
Op.getOperand(0).getValueType() == MVT::bf16) {
2556Op.getOpcode(), Loc,
Op.getValueType(),
2560// Everything else is considered legal. 2566EVT NarrowVT =
Op.getValueType();
2575// This combination was the first to support f32 -> bf16. 2582// Round-inexact-to-odd f64 to f32, then do the final rounding using 2583// the hardware f32 -> bf16 instruction. 2595// Everything else is considered legal. 2603EVT WideVT =
Op.getValueType();
2624// Everything else is considered legal. 2630if (
Op.getValueType() != MVT::v2i16)
2632EVT EltVT =
Op.getValueType().getVectorElementType();
2634for (
intI = 0, E =
Op.getValueType().getVectorNumElements();
I < E;
I++) {
2637 [&](
constSDUse &O) {
2638 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2639 O.get(), DAG.getIntPtrConstant(I, DL));
2650switch (
Op.getOpcode()) {
2660return LowerBUILD_VECTOR(
Op, DAG);
2662return LowerBITCAST(
Op, DAG);
2666return LowerEXTRACT_VECTOR_ELT(
Op, DAG);
2668return LowerINSERT_VECTOR_ELT(
Op, DAG);
2670return LowerVECTOR_SHUFFLE(
Op, DAG);
2672return LowerCONCAT_VECTORS(
Op, DAG);
2674return LowerSTORE(
Op, DAG);
2676return LowerLOAD(
Op, DAG);
2678return LowerShiftLeftParts(
Op, DAG);
2681return LowerShiftRightParts(
Op, DAG);
2683return LowerSelect(
Op, DAG);
2685return LowerFROUND(
Op, DAG);
2687return LowerFCOPYSIGN(
Op, DAG);
2690return LowerINT_TO_FP(
Op, DAG);
2693return LowerFP_TO_INT(
Op, DAG);
2695return LowerFP_ROUND(
Op, DAG);
2697return LowerFP_EXTEND(
Op, DAG);
2699return LowerBR_JT(
Op, DAG);
2701return LowerVAARG(
Op, DAG);
2703return LowerVASTART(
Op, DAG);
2723return LowerCopyToReg_128(
Op, DAG);
2727// Used only for bf16 on SM80, where we select fma for non-ftz operation 2728return PromoteBinOpIfF32FTZ(
Op, DAG);
2738constauto *JT = cast<JumpTableSDNode>(
Op.getOperand(1));
2741unsigned JId = JT->getIndex();
2747// Generate BrxStart node 2751// Generate BrxItem nodes 2757// Generate BrxEnd nodes 2765// This will prevent AsmPrinter from trying to print the jump tables itself. 2770// This function is almost a copy of SelectionDAG::expandVAArg(). 2771// The only diff is that this one produces loads from local address space. 2777constValue *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2778EVT VT = Node->getValueType(0);
2780SDValue Tmp1 = Node->getOperand(0);
2781SDValue Tmp2 = Node->getOperand(1);
2782constMaybeAlign MA(Node->getConstantOperandVal(3));
2798// Increment the pointer, VAList, to the next vaarg 2803// Store the incremented VAList to the legalized pointer 2810// Load the actual argument out of the pointer VAList 2819// Store the address of unsized array <function>_vararg[] in the ap object. 2820SDValue Arg = getParamSymbol(DAG,
/* vararg */ -1, PtrVT);
2823constValue *SV = cast<SrcValueSDNode>(
Op.getOperand(2))->getValue();
2834assert(
Op.getValueType() == MVT::i1 &&
"Custom lowering enabled only for i1");
2845if (
Op.getValueType() == MVT::i1)
2846return LowerLOADi1(
Op, DAG);
2848// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2849// unaligned loads and have to handle it here. 2850EVT VT =
Op.getValueType();
2853EVT MemVT =
Load->getMemoryVT();
2855 MemVT, *
Load->getMemOperand())) {
2867// v1 = ld i8* addr (-> i16) 2868// v = trunc i16 to i1 2875"Custom lowering for i1 load only");
2877LD->getBasePtr(),
LD->getPointerInfo(),
2878 MVT::i8,
LD->getAlign(),
2879LD->getMemOperand()->getFlags());
2881// The legalizer (the caller) is expecting two values from the legalized 2882// load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2883// in LegalizeDAG.cpp which also uses MergeValues. 2884SDValue Ops[] = { result,
LD->getChain() };
2893return LowerSTOREi1(
Op, DAG);
2895// v2f16 is legal, so we can't rely on legalizer to handle unaligned 2896// stores and have to handle it here. 2897if ((
Isv2x16VT(VT) || VT == MVT::v4i8) &&
2899 VT, *
Store->getMemOperand()))
2902// v2f16, v2bf16 and v2i16 don't need special handling. 2907return LowerSTOREVector(
Op, DAG);
2920if (!NumEltsAndEltVT)
2922auto [NumElts, EltVT] = NumEltsAndEltVT.value();
2929if (Alignment < PrefAlign) {
2930// This store is not sufficiently aligned, so bail out and let this vector 2931// store be scalarized. Note that we may still be able to emit smaller 2932// vector stores. For example, if we are storing a <4 x float> with an 2933// alignment of 8, this check will fail but the legalizer will try again 2934// with 2 x <2 x float>, which will succeed with an alignment of 8. 2938// Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2939// Therefore, we must ensure the type is legal. For i1 and i8, we set the 2940// stored type to i16 and propagate the "real" type as the memory type. 2959// First is the chain 2962// Then the split values 2964"NumElts should not increase, only decrease or stay the same.");
2966// If the number of elements has decreased, getVectorLoweringShape has 2967// upsized the element types 2970// Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be 2973for (
unsigned i = 0; i < NumElts; ++i) {
2976 NumEltsPerSubVector);
2981for (
unsigned i = 0; i < NumElts; ++i) {
2990// Then any remaining arguments 2991 Ops.
append(
N->op_begin() + 2,
N->op_end());
2997// return DCI.CombineTo(N, NewSt, true); 3016ST->getAlign(),
ST->getMemOperand()->getFlags());
3022// Change the CopyToReg to take in two 64-bit operands instead of a 128-bit 3023// operand so that it can pass the legalization. 3025assert(
Op.getOperand(1).getValueType() == MVT::i128 &&
3026"Custom lowering for 128-bit CopyToReg only");
3040 NewOps[0] =
Op->getOperand(0);
// Chain 3041 NewOps[1] =
Op->getOperand(1);
// Dst Reg 3042 NewOps[2] =
Lo;
// Lower 64-bit 3043 NewOps[3] =
Hi;
// Higher 64-bit 3045 NewOps[4] =
Op->getOperand(3);
// Glue if exists 3050unsigned NVPTXTargetLowering::getNumRegisters(
3052 std::optional<MVT> RegisterVT = std::nullopt)
const{
3053if (VT == MVT::i128 && RegisterVT == MVT::i128)
3058bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3060unsigned NumParts,
MVT PartVT, std::optional<CallingConv::ID>
CC)
const{
3068// This creates target external symbol for a function parameter. 3069// Name of the symbol is composed from its index and the function name. 3070// Negative index corresponds to special parameter (unsized array) used for 3071// passing variable arguments. 3092 std::vector<SDValue> OutChains;
3095assert(isABI &&
"Non-ABI compilation is not supported");
3099 std::vector<Type *> argTypes;
3100 std::vector<const Argument *> theArgs;
3102 theArgs.push_back(&
I);
3103 argTypes.push_back(
I.getType());
3105// argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3106// Ins.size() will be larger 3107// * if there is an aggregate argument with multiple fields (each field 3108// showing up separately in Ins) 3109// * if there is a vector argument with more than typical vector-length 3110// elements (generally if more than 4) where each vector element is 3111// individually present in Ins. 3112// So a different index should be used for indexing into Ins. 3113// See similar issue in LowerCall. 3116for (
unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3117Type *Ty = argTypes[i];
3119if (theArgs[i]->use_empty()) {
3128for (
unsigned parti = 0, parte = vtparts.
size(); parti != parte;
3133if (vtparts.
size() > 0)
3140for (
unsigned parti = 0; parti < NumRegs; ++parti) {
3152// In the following cases, assign a node order of "i+1" 3153// to newly created nodes. The SDNodes for params have to 3154// appear in the same order as their order of appearance 3155// in the original function. "i+1" holds that order. 3157bool aggregateIsPacked =
false;
3158if (
StructType *STy = dyn_cast<StructType>(Ty))
3159 aggregateIsPacked = STy->isPacked();
3171SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3172int VecIdx = -1;
// Index of the first element of the current vector. 3173for (
unsigned parti = 0, parte = VTs.
size(); parti != parte; ++parti) {
3175assert(VecIdx == -1 &&
"Orphaned vector.");
3179// That's the last element of this store op. 3181unsigned NumElts = parti - VecIdx + 1;
3182EVT EltVT = VTs[parti];
3183// i1 is loaded/stored as i8. 3185if (EltVT == MVT::i1)
3187elseif (
Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3188// getLoad needs a vector type, but it can't handle 3189// vectors which contain v2f16 or v2bf16 elements. So we must load 3190// using i32 here and then bitcast back. 3201if (aggregateIsPacked)
3214P.getNode()->setIROrder(i + 1);
3215for (
unsigned j = 0; j < NumElts; ++j) {
3218// We've loaded i1 as an i8 and now must truncate it back to i1 3219if (EltVT == MVT::i1)
3221// v2f16 was loaded as an i32. Now we must bitcast it back. 3222elseif (EltVT != LoadVT)
3225// If a promoted integer type is used, truncate down to the original 3231// Extend the element if necessary (e.g. an i8 is loaded 3232// into an i16 register) 3234 Ins[InsIdx].VT.getFixedSizeInBits() >
3238 Elt = DAG.
getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3243// Reset vector tracking state. 3253// Param has ByVal attribute 3254// Return MoveParam(param symbol). 3255// Ideally, the param symbol can be returned directly, 3256// but when SDNode builder decides to use it in a CopyToReg(), 3257// machine instruction fails because TargetExternalSymbol 3258// (not lowered) is target dependent, and CopyToReg assumes 3259// the source is lowered. 3261assert(ObjectVT == Ins[InsIdx].VT &&
3262"Ins type did not match function type");
3263SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3266 p.getNode()->setIROrder(i + 1);
3270if (!OutChains.empty())
// Use byte-store when the param address of the return value is unaligned.
// This may happen when the return value is a field of a packed structure.
  // Bit logic only works on integer types
  for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3287// Shift the byte to the last byte position 3292// Trunc store only the last byte by using 3294// The register type can be larger than b8. 3296 DAG.
getVTList(MVT::Other), StoreOperands,
3314assert(isABI &&
"Non-ABI compilation is not supported");
3323assert(VTs.
size() == OutVals.
size() &&
"Bad return value decomposition");
3325for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
3326SDValue PromotedOutVal = OutVals[i];
3329 VTs[i] =
EVT(PromotedVT);
3334 PromotedOutVal = DAG.
getNode(Ext, dl, PromotedVT, PromotedOutVal);
3336 PromotedOutVals.
push_back(PromotedOutVal);
3344// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3345// 32-bits are sign extended or zero extended, depending on whether 3346// they are signed or unsigned types. 3347bool ExtendIntegerRetVal =
3348RetTy->isIntegerTy() &&
DL.getTypeAllocSizeInBits(
RetTy) < 32;
3351for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
3353SDValue RetVal = PromotedOutVals[i];
3355if (ExtendIntegerRetVal) {
3358 dl, MVT::i32, RetVal);
3360// Use 16-bit registers for small load-stores as it's the 3361// smallest general purpose register size supported by NVPTX. 3365// If we have a PVF_SCALAR entry, it may not even be sufficiently aligned 3366// for a scalar store. In such cases, fall back to byte stores. 3368EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3369Align ElementTypeAlign =
3370DL.getABITypeAlign(ElementType.getTypeForEVT(
RetTy->getContext()));
3373if (ElementAlign < ElementTypeAlign) {
3374assert(StoreOperands.
empty() &&
"Orphaned operand list.");
3378// The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes 3379// into the graph, so just move on to the next element. 3384// New load/store. Record chain and offset operands. 3386assert(StoreOperands.
empty() &&
"Orphaned operand list.");
3391// Record the value to return. 3394// That's the last element of this store op. 3397unsigned NumElts = StoreOperands.
size() - 2;
3412// Adjust type of load/store op if we've extended the scalar 3414EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3416Op, dl, DAG.
getVTList(MVT::Other), StoreOperands, TheStoreType,
3418// Cleanup vector state. 3419 StoreOperands.
clear();
3429if (Constraint.
size() > 1)
3434// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3436// because we need the information that is only available in the "Value" type 3438// pointer. In particular, the address space information. 3445case Intrinsic::nvvm_match_all_sync_i32p:
3446case Intrinsic::nvvm_match_all_sync_i64p:
3448// memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3449// in order to model data exchange with other threads, but perform no real 3451Info.memVT = MVT::i1;
3453// Our result depends on both our and other thread's arguments. 3456case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3457case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3458case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3459case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3460case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3461case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3462case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3463case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3464case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3465case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3466case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3467case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3468case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3469case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3470case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3471case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3472case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3473case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3474case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3475case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3476case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3477case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3478case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3479case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3481Info.memVT = MVT::v8f16;
3482Info.ptrVal =
I.getArgOperand(0);
3488case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3489case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3490case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3491case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3492case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3493case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3494case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3495case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3496case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3497case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3498case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3499case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3500case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3501case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3502case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3503case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3504case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3505case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3506case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3507case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3508case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3509case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3510case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3511case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3513Info.memVT = MVT::v2i32;
3514Info.ptrVal =
I.getArgOperand(0);
3521case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3522case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3523case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3524case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3525case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3526case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3527case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3528case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3529case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3530case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3531case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3532case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3533case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3534case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3535case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3536case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3538case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3539case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3540case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3541case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3542case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3543case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3544case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3545case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3546case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3547case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3548case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3549case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3550case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3551case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3552case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3553case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3554case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3555case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3557Info.memVT = MVT::v4i32;
3558Info.ptrVal =
I.getArgOperand(0);
3565case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3566case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3567case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3568case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3569case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3570case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3571case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3572case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3574case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3575case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3576case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3577case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3578case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3579case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3580case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3581case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3582case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3583case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3584case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3585case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3586case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3587case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3588case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3589case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3590case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3591case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3592case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3593case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3594case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3595case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3597Info.memVT = MVT::i32;
3598Info.ptrVal =
I.getArgOperand(0);
3605case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3606case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3607case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3608case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3609case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3610case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3611case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3612case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3613case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3614case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3615case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3616case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3618Info.memVT = MVT::v4f16;
3619Info.ptrVal =
I.getArgOperand(0);
3626case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3627case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3628case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3629case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3630case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3631case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3632case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3633case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3634case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3635case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3636case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3637case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3638case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3639case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3640case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3641case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3643Info.memVT = MVT::v8f32;
3644Info.ptrVal =
I.getArgOperand(0);
3651case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3652case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3653case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3654case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3656case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3657case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3658case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3659case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3661case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3662case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3663case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3664case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3665case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3666case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3667case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3668case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3669case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3670case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3671case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3672case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3674Info.memVT = MVT::v8i32;
3675Info.ptrVal =
I.getArgOperand(0);
3682case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3683case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3684case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3685case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3686case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3687case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3688case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3689case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3690case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3691case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
3701case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3702case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3703case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3704case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3706case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3707case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3708case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3709case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
    Info.memVT = MVT::f64;
    Info.ptrVal = I.getArgOperand(0);
3719case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3720case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3721case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3722case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
3732case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3733case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3734case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3735case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3736case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3737case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3738case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3739case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3740case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3741case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3742case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3743case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
3753case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3754case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3755case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3756case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3757case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3758case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3759case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3760case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3761case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3762case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3763case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3764case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3765case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3766case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3767case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3768case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
3778case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3779case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3780case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3781case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3782case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3783case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3784case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3785case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3786case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3787case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3788case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3789case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
3799case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3800case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3801case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3802case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3803case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3804case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3805case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3806case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
3816case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
3817case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
3818case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
3819case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
3829case Intrinsic::nvvm_atomic_load_inc_32:
3830case Intrinsic::nvvm_atomic_load_dec_32:
3832case Intrinsic::nvvm_atomic_add_gen_f_cta:
3833case Intrinsic::nvvm_atomic_add_gen_f_sys:
3834case Intrinsic::nvvm_atomic_add_gen_i_cta:
3835case Intrinsic::nvvm_atomic_add_gen_i_sys:
3836case Intrinsic::nvvm_atomic_and_gen_i_cta:
3837case Intrinsic::nvvm_atomic_and_gen_i_sys:
3838case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3839case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3840case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3841case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3842case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3843case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3844case Intrinsic::nvvm_atomic_max_gen_i_cta:
3845case Intrinsic::nvvm_atomic_max_gen_i_sys:
3846case Intrinsic::nvvm_atomic_min_gen_i_cta:
3847case Intrinsic::nvvm_atomic_min_gen_i_sys:
3848case Intrinsic::nvvm_atomic_or_gen_i_cta:
3849case Intrinsic::nvvm_atomic_or_gen_i_sys:
3850case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3851case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3852case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3853case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
    auto &DL = I.getDataLayout();
    Info.ptrVal = I.getArgOperand(0);
3864case Intrinsic::nvvm_ldu_global_i:
3865case Intrinsic::nvvm_ldu_global_f:
3866case Intrinsic::nvvm_ldu_global_p: {
    auto &DL = I.getDataLayout();
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
    Info.ptrVal = I.getArgOperand(0);
    Info.align =
        cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3882case Intrinsic::nvvm_tex_1d_v4f32_s32:
3883case Intrinsic::nvvm_tex_1d_v4f32_f32:
3884case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3885case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3886case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3887case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3888case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3889case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3890case Intrinsic::nvvm_tex_2d_v4f32_s32:
3891case Intrinsic::nvvm_tex_2d_v4f32_f32:
3892case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3893case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3894case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3895case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3896case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3897case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3898case Intrinsic::nvvm_tex_3d_v4f32_s32:
3899case Intrinsic::nvvm_tex_3d_v4f32_f32:
3900case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3901case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3902case Intrinsic::nvvm_tex_cube_v4f32_f32:
3903case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3904case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3905case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3906case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3907case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3908case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3909case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3910case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3911case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3912case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3913case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3914case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3915case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3916case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3917case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3918case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3919case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3920case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3921case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3922case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3923case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3924case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3925case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3926case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3927case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3928case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3929case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3930case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3931case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3932case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3933case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3934case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3935case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3936case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3937case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3938case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3939case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
3948case Intrinsic::nvvm_tex_1d_v4s32_s32:
3949case Intrinsic::nvvm_tex_1d_v4s32_f32:
3950case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3951case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3952case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3953case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3954case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3955case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3956case Intrinsic::nvvm_tex_2d_v4s32_s32:
3957case Intrinsic::nvvm_tex_2d_v4s32_f32:
3958case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3959case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3960case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3961case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3962case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3963case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3964case Intrinsic::nvvm_tex_3d_v4s32_s32:
3965case Intrinsic::nvvm_tex_3d_v4s32_f32:
3966case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3967case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3968case Intrinsic::nvvm_tex_cube_v4s32_f32:
3969case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3970case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3971case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3972case Intrinsic::nvvm_tex_cube_v4u32_f32:
3973case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3974case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3975case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3976case Intrinsic::nvvm_tex_1d_v4u32_s32:
3977case Intrinsic::nvvm_tex_1d_v4u32_f32:
3978case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3979case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3980case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3981case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3982case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3983case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3984case Intrinsic::nvvm_tex_2d_v4u32_s32:
3985case Intrinsic::nvvm_tex_2d_v4u32_f32:
3986case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3987case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3988case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3989case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3990case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3991case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3992case Intrinsic::nvvm_tex_3d_v4u32_s32:
3993case Intrinsic::nvvm_tex_3d_v4u32_f32:
3994case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3995case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3996case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3997case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3998case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3999case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4000case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4001case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4002case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4003case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4004case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4005case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4006case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4007case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4008case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4009case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4010case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4011case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4012case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4013case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4014case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4015case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4016case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4017case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4018case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4019case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4020case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4021case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4022case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4023case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4024case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4025case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4026case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4027case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4028case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4029case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4030case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4031case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4032case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4033case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4034case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4035case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4036case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4037case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4038case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4039case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4040case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4041case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4042case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4043case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4044case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4045case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4046case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4047case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4048case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4049case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4050case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4051case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4052case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4053case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4054case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4055case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4056case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4057case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4058case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4059case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4060case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4061case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4062case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4063case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
4072case Intrinsic::nvvm_suld_1d_i8_clamp:
4073case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4074case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4075case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4076case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4077case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4078case Intrinsic::nvvm_suld_2d_i8_clamp:
4079case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4080case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4081case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4082case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4083case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4084case Intrinsic::nvvm_suld_3d_i8_clamp:
4085case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4086case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4087case Intrinsic::nvvm_suld_1d_i8_trap:
4088case Intrinsic::nvvm_suld_1d_v2i8_trap:
4089case Intrinsic::nvvm_suld_1d_v4i8_trap:
4090case Intrinsic::nvvm_suld_1d_array_i8_trap:
4091case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4092case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4093case Intrinsic::nvvm_suld_2d_i8_trap:
4094case Intrinsic::nvvm_suld_2d_v2i8_trap:
4095case Intrinsic::nvvm_suld_2d_v4i8_trap:
4096case Intrinsic::nvvm_suld_2d_array_i8_trap:
4097case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4098case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4099case Intrinsic::nvvm_suld_3d_i8_trap:
4100case Intrinsic::nvvm_suld_3d_v2i8_trap:
4101case Intrinsic::nvvm_suld_3d_v4i8_trap:
4102case Intrinsic::nvvm_suld_1d_i8_zero:
4103case Intrinsic::nvvm_suld_1d_v2i8_zero:
4104case Intrinsic::nvvm_suld_1d_v4i8_zero:
4105case Intrinsic::nvvm_suld_1d_array_i8_zero:
4106case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4107case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4108case Intrinsic::nvvm_suld_2d_i8_zero:
4109case Intrinsic::nvvm_suld_2d_v2i8_zero:
4110case Intrinsic::nvvm_suld_2d_v4i8_zero:
4111case Intrinsic::nvvm_suld_2d_array_i8_zero:
4112case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4113case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4114case Intrinsic::nvvm_suld_3d_i8_zero:
4115case Intrinsic::nvvm_suld_3d_v2i8_zero:
4116case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
4125case Intrinsic::nvvm_suld_1d_i16_clamp:
4126case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4127case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4128case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4129case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4130case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4131case Intrinsic::nvvm_suld_2d_i16_clamp:
4132case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4133case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4134case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4135case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4136case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4137case Intrinsic::nvvm_suld_3d_i16_clamp:
4138case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4139case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4140case Intrinsic::nvvm_suld_1d_i16_trap:
4141case Intrinsic::nvvm_suld_1d_v2i16_trap:
4142case Intrinsic::nvvm_suld_1d_v4i16_trap:
4143case Intrinsic::nvvm_suld_1d_array_i16_trap:
4144case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4145case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4146case Intrinsic::nvvm_suld_2d_i16_trap:
4147case Intrinsic::nvvm_suld_2d_v2i16_trap:
4148case Intrinsic::nvvm_suld_2d_v4i16_trap:
4149case Intrinsic::nvvm_suld_2d_array_i16_trap:
4150case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4151case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4152case Intrinsic::nvvm_suld_3d_i16_trap:
4153case Intrinsic::nvvm_suld_3d_v2i16_trap:
4154case Intrinsic::nvvm_suld_3d_v4i16_trap:
4155case Intrinsic::nvvm_suld_1d_i16_zero:
4156case Intrinsic::nvvm_suld_1d_v2i16_zero:
4157case Intrinsic::nvvm_suld_1d_v4i16_zero:
4158case Intrinsic::nvvm_suld_1d_array_i16_zero:
4159case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4160case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4161case Intrinsic::nvvm_suld_2d_i16_zero:
4162case Intrinsic::nvvm_suld_2d_v2i16_zero:
4163case Intrinsic::nvvm_suld_2d_v4i16_zero:
4164case Intrinsic::nvvm_suld_2d_array_i16_zero:
4165case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4166case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4167case Intrinsic::nvvm_suld_3d_i16_zero:
4168case Intrinsic::nvvm_suld_3d_v2i16_zero:
4169case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
4178case Intrinsic::nvvm_suld_1d_i32_clamp:
4179case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4180case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4181case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4182case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4183case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4184case Intrinsic::nvvm_suld_2d_i32_clamp:
4185case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4186case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4187case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4188case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4189case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4190case Intrinsic::nvvm_suld_3d_i32_clamp:
4191case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4192case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4193case Intrinsic::nvvm_suld_1d_i32_trap:
4194case Intrinsic::nvvm_suld_1d_v2i32_trap:
4195case Intrinsic::nvvm_suld_1d_v4i32_trap:
4196case Intrinsic::nvvm_suld_1d_array_i32_trap:
4197case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4198case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4199case Intrinsic::nvvm_suld_2d_i32_trap:
4200case Intrinsic::nvvm_suld_2d_v2i32_trap:
4201case Intrinsic::nvvm_suld_2d_v4i32_trap:
4202case Intrinsic::nvvm_suld_2d_array_i32_trap:
4203case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4204case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4205case Intrinsic::nvvm_suld_3d_i32_trap:
4206case Intrinsic::nvvm_suld_3d_v2i32_trap:
4207case Intrinsic::nvvm_suld_3d_v4i32_trap:
4208case Intrinsic::nvvm_suld_1d_i32_zero:
4209case Intrinsic::nvvm_suld_1d_v2i32_zero:
4210case Intrinsic::nvvm_suld_1d_v4i32_zero:
4211case Intrinsic::nvvm_suld_1d_array_i32_zero:
4212case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4213case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4214case Intrinsic::nvvm_suld_2d_i32_zero:
4215case Intrinsic::nvvm_suld_2d_v2i32_zero:
4216case Intrinsic::nvvm_suld_2d_v4i32_zero:
4217case Intrinsic::nvvm_suld_2d_array_i32_zero:
4218case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4219case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4220case Intrinsic::nvvm_suld_3d_i32_zero:
4221case Intrinsic::nvvm_suld_3d_v2i32_zero:
4222case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
4231case Intrinsic::nvvm_suld_1d_i64_clamp:
4232case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4233case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4234case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4235case Intrinsic::nvvm_suld_2d_i64_clamp:
4236case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4237case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4238case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4239case Intrinsic::nvvm_suld_3d_i64_clamp:
4240case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4241case Intrinsic::nvvm_suld_1d_i64_trap:
4242case Intrinsic::nvvm_suld_1d_v2i64_trap:
4243case Intrinsic::nvvm_suld_1d_array_i64_trap:
4244case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4245case Intrinsic::nvvm_suld_2d_i64_trap:
4246case Intrinsic::nvvm_suld_2d_v2i64_trap:
4247case Intrinsic::nvvm_suld_2d_array_i64_trap:
4248case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4249case Intrinsic::nvvm_suld_3d_i64_trap:
4250case Intrinsic::nvvm_suld_3d_v2i64_trap:
4251case Intrinsic::nvvm_suld_1d_i64_zero:
4252case Intrinsic::nvvm_suld_1d_v2i64_zero:
4253case Intrinsic::nvvm_suld_1d_array_i64_zero:
4254case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4255case Intrinsic::nvvm_suld_2d_i64_zero:
4256case Intrinsic::nvvm_suld_2d_v2i64_zero:
4257case Intrinsic::nvvm_suld_2d_array_i64_zero:
4258case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4259case Intrinsic::nvvm_suld_3d_i64_zero:
4260case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
  // Capping the alignment to 128 bytes as that is the maximum alignment
  const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return ABITypeAlign;

  return std::max(Align(16), ABITypeAlign);
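  // For example (illustrative, not taken from the surrounding code): a device
  // function parameter of type <4 x float> that is known to be 16-byte
  // aligned in .param space can be read with a single 128-bit vectorized
  // access (e.g. ld.param.v4.f32), whereas with only the 4-byte ABI alignment
  // the reads would have to be split into narrower accesses.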
/// Helper for computing alignment of a device function byval parameter.
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  // Old ptx versions have a bug. When PTX code takes address of
  // byval parameter with alignment < 4, ptxas generates code to
  // spill argument into memory. Alas on sm_50+ ptxas generates
  // SASS code that fails with misaligned access. To work around
  // the problem, make sure that we align byval parameters by at
  // least 4. This bug seems to be fixed at least starting from
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  ArgAlign = std::max(ArgAlign, Align(4));
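  // For example (illustrative): a byval struct parameter annotated with
  // "align 2" would be bumped to align 4 here, so affected ptxas versions do
  // not emit the misaligned spill sequence described above.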
// Helper for getting a function parameter name. Name is composed from
// its index and the function name. Negative index corresponds to special
// parameter (unsized array) used for passing variable arguments.
  std::string ParamName;
    ParamStr << "_vararg";
    ParamStr << "_param_" << Idx;
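  // For example (illustrative, hypothetical function name): for a device
  // function "foo", parameter index 2 yields the name "foo_param_2", while
  // the negative index used for the variadic marker yields "foo_vararg".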
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  //
  // immoff must fit in a signed 32-bit int
  case 0: // "r", "r+i" or "i" is allowed
    if (AM.HasBaseReg)
      // "r+r+i" or "r+r" is not allowed.
    // Otherwise we have r+i.
  // No scale > 1 is allowed

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
std::pair<unsigned, const TargetRegisterClass *>
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
                           "supported for sm_70 and higher!");
      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

  // Always honor command-line argument
  // Do not contract if we're not optimizing the code.
  // Honor TargetOptions flags that explicitly say fusion is okay.
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 0;

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (add (mul a, b), c))
  if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
                     ((ZeroOpNum == 1) ? N1 : MAD),
                     ((ZeroOpNum == 1) ? MAD : N1));
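  // For example (illustrative): when the multiply feeds only this add,
  //   (add (mul a, b), c)
  // can be selected as a single mad.lo instruction; if the mul has other
  // users it has to be materialized anyway, so the contraction would not
  // save an instruction.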
  // For floating point:
  // Do the fusion only when the mul has less than 5 uses and all
  // are add.
  // The heuristic is that if a use is not an add, then that use
  // cannot be fused into fma, therefore mul is still needed anyway.
  // If there are more than 4 uses, even if they are all add, fusing
  // them will increase register pressure.
  int orderNo = N->getIROrder();
  // Simple heuristics here for considering potential register
  // pressure: the logic is that the difference is used
  // to measure the distance between def and use; the longer the distance,
  // the more likely it is to cause register pressure.
  if (orderNo - orderNo2 < 500)
  // Now, check if at least one of the FMUL's operands is live beyond the
  // node N, which guarantees that the FMA will not increase register
  // pressure at node N.
  bool opIsLive = false;
  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
  int orderNo3 = User->getIROrder();
  if (orderNo3 > orderNo) {
  int orderNo3 = User->getIROrder();
  if (orderNo3 > orderNo) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  // Operands from the 2nd to the last one are the values to be stored

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
  // Skip non-integer, non-scalar case
  if (VT.isVector() || VT != MVT::i32)
  // First try with the default operand order.
  // If that didn't work, try again with the operands commuted.

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
  // First try with the default operand order.
  // If that didn't work, try again with the operands commuted.

  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  if (isa<ConstantSDNode>(Val)) {
    // Convert BFE -> truncate i16 -> and 255
    // to just BFE -> truncate i16, as the value already has all the bits in
    // the right place.
    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
      // Not an AND with a constant
    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
    // If we get here, the AND is unnecessary. Just replace it with the trunc
    // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
      // Not an AND with a constant
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
      // Not a MemSDNode?!?
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
    // If for some reason the load is a sextload, the and is needed to zero
    // out the high 8 bits
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
    // If we get here, the AND is unnecessary. Just replace it with the load

  // Don't do anything at less than -O2.
  EVT VT = N->getValueType(0);
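  // For example (illustrative): a zero-extending i8 load that is later
  // ANY_EXTENDed and masked with 255, i.e.
  //   (and (any_extend (zextload i8)), 255)
  // already has zeros in the upper bits, so the AND can be replaced by the
  // load result itself once the load has become a target-specific node.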
  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
    EVT OrigVT = Op.getOperand(0).getValueType();
    EVT OrigVT = Op.getOperand(0).getValueType();
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
  // The LHS operand must be a demotable op
  // We should have been able to determine the signedness from the LHS
  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
    const APInt &Val = CI->getAPIntValue();
    return Val.isIntN(OptSize);
  return LHSSign == RHSSign;
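  // For example (illustrative): for a 32-bit multiply, an LHS that is
  // (sext i16 %x) and a constant RHS such as 1000 (which fits in 16 bits,
  // per the isIntN check above) are both demotable, so the pair qualifies
  // for the mul.wide transform below.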
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {

  // Canonicalize the multiply so the constant (if any) is on the right
  if (isa<ConstantSDNode>(LHS)) {

  // If we have a SHL, determine the actual multiply amount

  // Verify that our operands are demotable
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
    DemotedVT = MVT::i32;

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
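  // For example (illustrative, hypothetical register names):
  //   (mul i32 (sext i16 %a), (sext i16 %b))
  // has both operands demotable to 16 bits, so it can be emitted as
  //   mul.wide.s16 %r, %ra, %rb;
  // producing the full 32-bit product directly from the 16-bit sources.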
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 1;

  return Add->getOperand(1);
  return Add->getOperand(0);

  // Do not combine if the resulting sequence is not obviously profitable.
                        (ConstOpNo == 1) ? X : NewMul,
                        (ConstOpNo == 1) ? NewMul : X);

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)

  // (mul x, (add y, 1)) -> (add (mul x, y), x)
  // (mul x, (select y, 1)) -> (select (mul x, y), x)

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
  // Try mul.wide combining at OptLevel > 0

  EVT CCType = N->getValueType(0);
  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)

  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
                DL, DCI.DAG.getVTList(MVT::i1, MVT::i1),
                {A, B, N->getOperand(2)});
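  // For example (illustrative): a SETCC of two v2f16 values with a v2i1
  // result is rebuilt here as a single target comparison node returning two
  // scalar i1 predicates (matching what setp.f16x2 produces), which are then
  // packed back together to recreate the v2i1 value.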
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
  // handle them OK.
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))

  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)

  // If element has non-integer type, bitcast it back to the expected type.
  // Past legalizer, we may need to extend i8 -> i16 to match the register type.
  if (EltVT != N->getValueType(0))

  if (VectorVT != MVT::v4i8)

  // We need to split vselect into individual per-element operations, because
  // we use the BFE/BFI instructions for byte extraction/insertion and we end
  // up with 32-bit values, so we may as well do the comparison as i32 to
  // avoid conversions to/from i16 normally used for i8 values.
  for (int I = 0; I < 4; ++I) {
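  // For example (illustrative): a (vselect %c, %a, %b) on v4i8 is rebuilt as
  // four independent i32-wide selects, one per byte, since the bytes are
  // extracted and reinserted with BFE/BFI as 32-bit values anyway.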
  auto VT = N->getValueType(0);
  auto Op0 = N->getOperand(0);
  auto Op1 = N->getOperand(1);

  // Start out by assuming we want to take the lower 2 bytes of each i32
  std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
                                                {&Op1, &Op1Bytes}};
  // Check that each operand is an i16, truncated from an i32 operand. We'll
  // select individual bytes from those original operands. Optionally, fold in
  // a shift right of that original operand.
  for (auto &[Op, OpBytes] : OpData) {
    // Eat up any bitcast
      *Op = Op->getOperand(0);

          Op->getOperand(0).getValueType() == MVT::i32))

    // If the truncate has multiple uses, this optimization can increase
    // register pressure.
    if (!Op->hasOneUse())

    *Op = Op->getOperand(0);

    // Optionally, fold in a shift-right of the original operand and let
    // permute pick the two higher bytes of the original value directly.
    if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
      if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
        // Shift the PRMT byte selector to pick upper bytes from each
        // respective value, instead of the lower ones: 0x10 -> 0x32,
        // 0x54 -> 0x76
        assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
               "PRMT selector values out of range");

        *Op = Op->getOperand(0);

      {Op0, Op1,
       DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
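  // For example (illustrative): with Op0Bytes = 0x10 and Op1Bytes = 0x54 the
  // combined selector is 0x5410, so the prmt.b32 result takes the two low
  // bytes of Op0 (selectors 0 and 1) and the two low bytes of Op1 (selectors
  // 4 and 5). After folding a srl by 16, the 0x10 selector becomes 0x32 and
  // picks Op0's upper two bytes instead.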
                                               DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {

  // Handle bitcasting to v2i8 without hitting the default promotion
  // strategy which goes through stack memory.
  EVT ToVT = Op->getValueType(0);
  if (ToVT != MVT::v2i8) {
  // Bitcast to i16 and unpack elements into a vector

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
  EVT ResVT = N->getValueType(0);
  if (!NumEltsAndEltVT)
  auto [NumElts, EltVT] = NumEltsAndEltVT.value();

  Align Alignment = LD->getAlign();
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};

  // Copy regular operands
  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
                              LD->getMemOperand());
         "NumElts should not increase, only decrease or stay the same.");
  // If the number of elements has decreased, getVectorLoweringShape has
  // upsized the element types
  // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
  // into individual elements.
  for (unsigned i = 0; i < NumElts; ++i) {
  for (unsigned i = 0; i < NumElts; ++i) {
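  // For example (illustrative): a sufficiently aligned load of <4 x float>
  // is rewritten as a single target load node with four f32 results plus a
  // chain (later selected to a vectorized ld.v4.f32), and the scalar results
  // are then reassembled into the original <4 x float> value.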
  // Get the intrinsic ID
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    // Since LDU/LDG are target nodes, we cannot rely on DAG type
    // legalization.
    // Therefore, we must ensure the type is legal. For i1 and i8, we set the
    // loaded type to i16 and propagate the "real" type as the memory type.
    bool NeedTrunc = false;
      LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
      EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};

    // Copy regular operands
    // Skip operand 1 (intrinsic ID)
    OtherOps.append(N->op_begin() + 2, N->op_end());

    for (unsigned i = 0; i < NumElts; ++i) {
           "Custom handling of non-i8 ldu/ldg?");

    // Just copy all operands as-is
    // Force output to i16
    // We make sure the memory type is i8, which will be used during isel
    // to select the proper instruction.

  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass the legalization
  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
void NVPTXTargetLowering::ReplaceNodeResults(
  switch (N->getOpcode()) {

  auto ITy = cast<llvm::IntegerType>(Ty);
  switch (ITy->getBitWidth()) {
  switch (ITy->getBitWidth()) {
// Pin NVPTXTargetObjectFile's vtables to this file.
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< std::pair< unsigned int, EVT > > getVectorLoweringShape(EVT VectorVT)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue Value)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
This file describes how to lower LLVM code to machine code.
Class for arbitrary precision integers.
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
bool slt(const APInt &RHS) const
Signed less than comparison.
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
const T & back() const
back - Get the last element.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
an instruction that atomically reads a memory location, combines it with another value,...
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
bool isFloatingPointOperation() const
BinOp getOperation() const
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Type * getReturnType() const
Returns the type of the ret val.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
StringRef getName() const
getName - Get the symbol name.
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasNativeBF16Support(int Opcode) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
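A minimal sketch of such a hook, assuming a hypothetical MyTargetLowering that asks the AtomicExpand pass to turn oversized atomicrmw instructions into cmpxchg loops (the 64-bit cutoff is purely illustrative):

  TargetLowering::AtomicExpansionKind
  MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
    // Expand anything wider than the widest natively supported atomic width.
    unsigned Bits = AI->getType()->getPrimitiveSizeInBits().getFixedValue();
    return Bits > 64 ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
  }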
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
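Sketch of how such an override typically fills in the IntrinsicInfo record; the intrinsic ID and types below are placeholders, only the shape of the code follows the real hook:

  bool MyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                                            MachineFunction &MF,
                                            unsigned Intrinsic) const {
    switch (Intrinsic) {
    case Intrinsic::my_ld_v4f32:                // hypothetical memory intrinsic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::v4f32;                  // in-memory value type
      Info.ptrVal = I.getArgOperand(0);         // pointer operand of the call
      Info.offset = 0;
      Info.align = Align(16);
      Info.flags = MachineMemOperand::MOLoad;   // reads memory only
      return true;                              // becomes a MemIntrinsicNode
    default:
      return false;
    }
  }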
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32() const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool allowUnsafeFPMath(MachineFunction &MF) const
int getDivF32Level() const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
~NVPTXTargetObjectFile() override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node represents an undefined value.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
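For example, a signed-minimum pattern can be built from these helpers roughly like this (DL, A and B are assumed to be an SDLoc and two i32 SDValues already in scope; a real target would take the boolean type from getSetCCResultType rather than hard-coding i1):

  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETLT);  // A < B ?
  SDValue Min = DAG.getSelect(DL, MVT::i32, Cmp, A, B);       // pick the smaller value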
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
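A small sketch of the simplest overload, assuming Chain, Ptr and DL are existing values inside a lowering routine and using a deliberately generic MachinePointerInfo:

  SDValue Load = DAG.getLoad(MVT::f32, DL, Chain, Ptr,
                             MachinePointerInfo(), Align(4));
  SDValue OutChain = Load.getValue(1); // result 1 of a load is the output chain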
const TargetLowering & getTargetLoweringInfo() const
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
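For instance, incrementing an existing i32 value N by one combines getConstant with the generic getNode builder listed further below (N and DL are assumed to be in scope):

  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue Inc = DAG.getNode(ISD::ADD, DL, MVT::i32, N, One);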
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
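Sketch of a plain aligned store, mirroring the load example above (Chain, Val, Ptr and DL assumed in scope):

  SDValue Store = DAG.getStore(Chain, DL, Val, Ptr,
                               MachinePointerInfo(), Align(4));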
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
MachineFunction & getMachineFunction() const
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
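Rough shape of a call to this builder for a target intrinsic that loads a v4f32; MyTargetISD::LoadV4 is a placeholder target opcode, and Chain, Ptr and DL are assumed to be in scope:

  SDVTList VTs = DAG.getVTList(MVT::v4f32, MVT::Other);   // value + chain results
  SDValue Ops[] = {Chain, Ptr};
  SDValue Ld = DAG.getMemIntrinsicNode(MyTargetISD::LoadV4, DL, VTs, Ops,
                                       MVT::v4f32, MachinePointerInfo(),
                                       Align(16), MachineMemOperand::MOLoad);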
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Class to represent struct types.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
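Illustrative constructor fragment (not the NVPTX configuration) showing the usual pattern of declaring per-type operation actions, including promotion via setOperationPromotedToType listed below:

  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);     // legalizer rewrites it
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); // routed to LowerOperation
  setOperationPromotedToType(ISD::SETCC, MVT::f16, MVT::f32);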
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Like MaxStoresPerMemcpy, but for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Like MaxStoresPerMemset, but for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Like MaxStoresPerMemmove, but for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrNegativeOneBooleanContent
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
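Companion sketch for the load-extension and truncating-store hooks (again a generic constructor fragment, not the NVPTX configuration):

  for (MVT VT : {MVT::f32, MVT::f64})
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);  // no native f16 extending load
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);         // no native f16 truncating store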
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
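A common use of these two helpers, sketched for a custom STORE lowering: check the access with allowsMemoryAccessForAlignment (listed above) and fall back to splitting when it is not supported (ST is an assumed StoreSDNode*):

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      ST->getMemoryVT(), ST->getAddressSpace(),
                                      ST->getAlign(),
                                      ST->getMemOperand()->getFlags()))
    return expandUnalignedStore(ST, DAG);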
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
MCSymbol * getSymbol(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
@ VoidTyID
type with no size
bool isAggregateType() const
Return true if the type is an aggregate type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
int getNumOccurrences() const
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
@ C
The default llvm calling convention, compatible with C.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ BSWAP
Byte Swap and Counting operators.
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ SIGN_EXTEND
Conversion operators.
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the llvm.readsteadycounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
RESULT, BOOL = [SU]SUBO(LHS, RHS) - Overflow-aware nodes for subtraction.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
@ SMULO
RESULT, BOOL = [SU]MULO(LHS, RHS) - Overflow-aware nodes for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Free functions and utilities from the llvm namespace referenced by this file.
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
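A tiny self-contained sketch of the range helpers listed here (all_of, enumerate and transform):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  void rangeDemo() {
    llvm::SmallVector<int, 4> In = {1, 2, 3, 4};
    llvm::SmallVector<int, 4> Out(In.size());

    bool AllPositive = llvm::all_of(In, [](int V) { return V > 0; });  // true

    for (auto E : llvm::enumerate(In))      // (index, value) pairs
      Out[E.index()] = E.value() * 2;

    llvm::transform(In, Out.begin(), [](int V) { return V + 1; });     // overwrite with V+1
    (void)AllPositive;
  }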
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
CodeGenOptLevel
Code generation optimization level.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
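Typical call shape, assuming TLI, DL and an IR type Ty are available; ValueVTs gets one entry per scalar piece and Offsets its byte offset within the type:

  SmallVector<EVT, 8> ValueVTs;
  SmallVector<TypeSize, 8> Offsets;
  ComputeValueVTs(TLI, DL, Ty, ValueVTs, &Offsets);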
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
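Self-contained sketch of the arithmetic and alignment helpers listed nearby (alignTo, PowerOf2Ceil, isPowerOf2_32 and commonAlignment):

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  void alignDemo() {
    uint64_t Padded = llvm::alignTo(10, llvm::Align(8));              // 16
    uint64_t Ceil = llvm::PowerOf2Ceil(10);                           // 16
    bool Pow2 = llvm::isPowerOf2_32(16);                              // true
    llvm::Align Common = llvm::commonAlignment(llvm::Align(16), 8);   // Align(8)
    (void)Padded; (void)Ceil; (void)Pow2; (void)Common;
  }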
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static const fltSemantics & IEEEsingle() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
ElementCount getVectorElementCount() const
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
uint64_t getScalarSizeInBits() const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
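Quick sketch of the EVT queries listed above, assuming an LLVMContext Ctx is in scope:

  EVT VecVT = EVT::getVectorVT(Ctx, MVT::f32, 4);   // v4f32
  unsigned NumElts = VecVT.getVectorNumElements();  // 4
  EVT EltVT = VecVT.getVectorElementType();         // f32
  uint64_t Bits = VecVT.getFixedSizeInBits();       // 128
  EVT IntVT = VecVT.changeTypeToInteger();          // v4i32
  (void)NumElts; (void)EltVT; (void)Bits; (void)IntVT;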
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isAfterLegalizeDAG() const
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)