LLVM 20.0.0git
NVPTXISelLowering.cpp
Go to the documentation of this file.
//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelLowering.h"
#include "MCTargetDesc/NVPTXBaseInfo.h"
#include "NVPTX.h"
#include "NVPTXSubtarget.h"
#include "NVPTXTargetMachine.h"
#include "NVPTXTargetObjectFile.h"
#include "NVPTXUtilities.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetCallingConv.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/FPEnv.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>
#include <iterator>
#include <optional>
#include <string>
#include <utility>
#include <vector>

#define DEBUG_TYPE "nvptx-lower"

using namespace llvm;

static std::atomic<unsigned> GlobalUniqueCallSite;

static cl::opt<bool> sched4reg(
    "nvptx-sched4reg",
    cl::desc("NVPTX Specific: schedule for register pressure"), cl::init(false));

static cl::opt<unsigned> FMAContractLevelOpt(
    "nvptx-fma-level", cl::Hidden,
    cl::desc("NVPTX Specific: FMA contraction (0: don't do it,"
             " 1: do it, 2: do it aggressively)"),
    cl::init(2));

static cl::opt<int> UsePrecDivF32(
    "nvptx-prec-divf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::init(2));

static cl::opt<bool> UsePrecSqrtF32(
    "nvptx-prec-sqrtf32", cl::Hidden,
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
    cl::init(true));

/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
/// does NOT use lg2.approx for log2, so this is disabled by default.
static cl::opt<bool> UseApproxLog2F32(
    "nvptx-approx-log2f32",
    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
    cl::init(false));

static cl::opt<bool> ForceMinByValParamAlign(
    "nvptx-force-min-byval-param-align", cl::Hidden,
    cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval"
             " params of device functions."),
    cl::init(false));

int NVPTXTargetLowering::getDivF32Level() const {
  if (UsePrecDivF32.getNumOccurrences() > 0) {
    // If nvptx-prec-divf32=N is used on the command-line, always honor it
    return UsePrecDivF32;
  } else {
    // Otherwise, use div.approx if fast math is enabled
    if (getTargetMachine().Options.UnsafeFPMath)
      return 0;
    else
      return 2;
  }
}
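
// Illustrative note (added comment, not in the original source): per the
// UsePrecDivF32 option description above, the returned level selects the
// f32 divide lowering roughly as follows:
//   0 -> div.approx (fast, approximate)
//   1 -> div.full   (full range, not IEEE-rounded)
//   2 -> IEEE-compliant div.rnd, when available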

bool NVPTXTargetLowering::usePrecSqrtF32() const {
  if (UsePrecSqrtF32.getNumOccurrences() > 0) {
    // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
    return UsePrecSqrtF32;
  } else {
    // Otherwise, use sqrt.approx if fast math is enabled
    return !getTargetMachine().Options.UnsafeFPMath;
  }
}

bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
  return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
         DenormalMode::PreserveSign;
}

static bool IsPTXVectorType(MVT VT) {
  switch (VT.SimpleTy) {
  default:
    return false;
  case MVT::v2i1:
  case MVT::v4i1:
  case MVT::v2i8:
  case MVT::v4i8:
  case MVT::v8i8:   // <2 x i8x4>
  case MVT::v16i8:  // <4 x i8x4>
  case MVT::v2i16:
  case MVT::v4i16:
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v2i32:
  case MVT::v4i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v4f16:
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v2bf16:
  case MVT::v4bf16:
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v2f32:
  case MVT::v4f32:
  case MVT::v2f64:
    return true;
  }
}

static bool Is16bitsType(MVT VT) {
  return (VT.SimpleTy == MVT::f16 || VT.SimpleTy == MVT::bf16 ||
          VT.SimpleTy == MVT::i16);
}

// When legalizing vector loads/stores, this function is called, which does two
// things:
// 1. Determines whether the vector is something we want to custom lower;
//    std::nullopt is returned if we do not want to custom lower it.
// 2. If we do want to handle it, returns two parameters:
//    - unsigned int NumElts - The number of elements in the final vector
//    - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, EVT>>
getVectorLoweringShape(EVT VectorVT) {
  if (!VectorVT.isVector() || !VectorVT.isSimple())
    return std::nullopt;

  EVT EltVT = VectorVT.getVectorElementType();
  unsigned NumElts = VectorVT.getVectorNumElements();

  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
  // legal. We can (and should) split that into 2 stores of <2 x double> here
  // but I'm leaving that as a TODO for now.
  switch (VectorVT.getSimpleVT().SimpleTy) {
  default:
    return std::nullopt;
  case MVT::v2i8:
  case MVT::v2i16:
  case MVT::v2i32:
  case MVT::v2i64:
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2f32:
  case MVT::v2f64:
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v4i32:
  case MVT::v4f16:
  case MVT::v4bf16:
  case MVT::v4f32:
    // This is a "native" vector type
    return std::pair(NumElts, EltVT);
  case MVT::v8i8:   // <2 x i8x4>
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v16i8:  // <4 x i8x4>
    // This can be upsized into a "native" vector type.
    // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
    // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
    // vectorized loads/stores with the actual element type for i8/i16 as that
    // would require v8/v16 variants that do not exist.
    // In order to load/store such vectors efficiently, here in Type
    // Legalization, we split the vector into word-sized chunks (v2x16/v4i8).
    // Later, we will lower to PTX as vectors of b32.

    // Number of elements to pack in one word.
    unsigned NPerWord = 32 / EltVT.getSizeInBits();

    return std::pair(NumElts / NPerWord,
                     MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord));
  }

  llvm_unreachable("All cases in switch should return.");
}
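
// Illustrative examples (added comment, not in the original source), assuming
// the packing rules described above:
//   v4f32  -> {4, f32}     "native" shape, returned unchanged
//   v8f16  -> {4, v2f16}   NPerWord = 32/16 = 2; eight halves become four words
//   v16i8  -> {4, v4i8}    NPerWord = 32/8  = 4; sixteen bytes become four words
//   v4f64  -> std::nullopt (not custom lowered here; see the TODO above)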

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
                               Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                               uint64_t StartingOffset = 0) {
  SmallVector<EVT, 16> TempVTs;
  SmallVector<uint64_t, 16> TempOffsets;

  // Special case for i128 - decompose to (i64, i64)
  if (Ty->isIntegerTy(128)) {
    ValueVTs.push_back(EVT(MVT::i64));
    ValueVTs.push_back(EVT(MVT::i64));

    if (Offsets) {
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
    }

    return;
  }

  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    auto ElementNum = 0;
    for (auto *EI : STy->elements()) {
      ComputePTXValueVTs(TLI, DL, EI, ValueVTs, Offsets,
                         StartingOffset + SL->getElementOffset(ElementNum));
      ++ElementNum;
    }
    return;
  }

  // Given an array type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Type *EltTy = ATy->getElementType();
    uint64_t EltSize = DL.getTypeAllocSize(EltTy);
    for (int I : llvm::seq<int>(ATy->getNumElements()))
      ComputePTXValueVTs(TLI, DL, EltTy, ValueVTs, Offsets,
                         StartingOffset + I * EltSize);
    return;
  }

  ComputeValueVTs(TLI, DL, Ty, TempVTs, &TempOffsets, StartingOffset);
  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    EVT VT = TempVTs[i];
    uint64_t Off = TempOffsets[i];
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
    if (VT.isVector()) {
      unsigned NumElts = VT.getVectorNumElements();
      EVT EltVT = VT.getVectorElementType();
      // We require power-of-2 sized vectors because
      // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
      // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
      // vectors.
      if ((Is16bitsType(EltVT.getSimpleVT())) && NumElts % 2 == 0 &&
          isPowerOf2_32(NumElts)) {
        // Vectors with an even number of f16 elements will be passed to
        // us as an array of v2f16/v2bf16 elements. We must match this so we
        // stay in sync with Ins/Outs.
        switch (EltVT.getSimpleVT().SimpleTy) {
        case MVT::f16:
          EltVT = MVT::v2f16;
          break;
        case MVT::bf16:
          EltVT = MVT::v2bf16;
          break;
        case MVT::i16:
          EltVT = MVT::v2i16;
          break;
        default:
          llvm_unreachable("Unexpected type");
        }
        NumElts /= 2;
      } else if (EltVT.getSimpleVT() == MVT::i8 &&
                 ((NumElts % 4 == 0 && isPowerOf2_32(NumElts)) ||
                  NumElts == 3)) {
        // v*i8 are formally lowered as v4i8
        EltVT = MVT::v4i8;
        NumElts = (NumElts + 3) / 4;
      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
        // v2i8 is promoted to v2i16
        NumElts = 1;
        EltVT = MVT::v2i16;
      }
      for (unsigned j = 0; j != NumElts; ++j) {
        ValueVTs.push_back(EltVT);
        if (Offsets)
          Offsets->push_back(Off + j * EltVT.getStoreSize());
      }
    } else {
      ValueVTs.push_back(VT);
      if (Offsets)
        Offsets->push_back(Off);
    }
  }
}
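
// Worked examples (added comment, not in the original source), following the
// rules above:
//   <4 x half> -> ValueVTs = {v2f16, v2f16}, Offsets = {0, 4}
//     (even f16 counts are kept as v2f16 pairs, 4 bytes apart)
//   i128       -> ValueVTs = {i64, i64},     Offsets = {0, 8}
//     (per the special case at the top of the function)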

/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromotedVT if the function returns true.
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT) {
  if (VT.isScalarInteger()) {
    switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
    default:
      llvm_unreachable(
          "Promotion is not suitable for scalars of size larger than 64-bits");
    case 1:
      *PromotedVT = MVT::i1;
      break;
    case 2:
    case 4:
    case 8:
      *PromotedVT = MVT::i8;
      break;
    case 16:
      *PromotedVT = MVT::i16;
      break;
    case 32:
      *PromotedVT = MVT::i32;
      break;
    case 64:
      *PromotedVT = MVT::i64;
      break;
    }
    return EVT(*PromotedVT) != VT;
  }
  return false;
}
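
// Examples (added comment, not in the original source):
//   i20 -> PowerOf2Ceil(20) = 32 -> *PromotedVT = i32, returns true
//   i32 -> PowerOf2Ceil(32) = 32 -> *PromotedVT = i32, returns false (no-op)
//   f32 -> not a scalar integer, returns false and leaves *PromotedVT alone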

// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
    const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {

  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
    return 1;
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
    return 1;

  EVT EltVT = ValueVTs[Idx];
  unsigned EltSize = EltVT.getStoreSize();

  // Element is too large to vectorize.
  if (EltSize >= AccessSize)
    return 1;

  unsigned NumElts = AccessSize / EltSize;
  // Can't vectorize if AccessSize is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)
    return 1;

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())
    return 1;

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)
    return 1;

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
      return 1;

    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)
      return 1;
  }
  // OK. We can vectorize ValueVTs[i..i+NumElts)
  return NumElts;
}
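
// Worked example (added comment, not in the original source): with
//   ValueVTs = {f32, f32, f32, f32}, Offsets = {0, 4, 8, 12},
//   ParamAlignment = 16, Idx = 0, AccessSize = 16
// every check above passes (16-byte aligned, EltSize = 4, NumElts = 4,
// matching contiguous elements), so the function returns 4 and the four
// pieces can be covered by one 4-element vector param access.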

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
enum ParamVectorizationFlags {
  PVF_INNER = 0x0, // Middle elements of a vector.
  PVF_FIRST = 0x1, // First element of the vector.
  PVF_LAST = 0x2,  // Last element of the vector.
  // Scalar is effectively a 1-element vector.
  PVF_SCALAR = PVF_FIRST | PVF_LAST
};

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
                     const SmallVectorImpl<uint64_t> &Offsets,
                     Align ParamAlignment, bool IsVAArg = false) {
  // Set vector size to match ValueVTs and mark all elements as
  // scalars by default.
  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);

  if (IsVAArg)
    return VectorInfo;

  // Check what we can vectorize using 128/64/32-bit accesses.
  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
    // Skip elements we've already processed.
    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
    for (unsigned AccessSize : {16, 8, 4, 2}) {
      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
      // Mark vectorized elements.
      switch (NumElts) {
      default:
        llvm_unreachable("Unexpected return value");
      case 1:
        // Can't vectorize using this size, try next smaller size.
        continue;
      case 2:
        assert(I + 1 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_LAST;
        I += 1;
        break;
      case 4:
        assert(I + 3 < E && "Not enough elements.");
        VectorInfo[I] = PVF_FIRST;
        VectorInfo[I + 1] = PVF_INNER;
        VectorInfo[I + 2] = PVF_INNER;
        VectorInfo[I + 3] = PVF_LAST;
        I += 3;
        break;
      }
      // Break out of the inner loop because we've already succeeded
      // using largest possible AccessSize.
      break;
    }
  }
  return VectorInfo;
}
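
// Example result (added comment, not in the original source): continuing the
// CanMergeParamLoadStoresStartingAt example, four contiguous f32 pieces with
// 16-byte alignment come back as
//   { PVF_FIRST, PVF_INNER, PVF_INNER, PVF_LAST }
// i.e. one 4-element access, while an isolated trailing i8 piece would stay
// PVF_SCALAR and be loaded/stored on its own.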

static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT,
                            SDValue Value) {
  if (Value->getValueType(0) == VT)
    return Value;
  return DAG.getNode(ISD::BITCAST, DL, VT, Value);
}

// NVPTXTargetLowering Constructor.
NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                         const NVPTXSubtarget &STI)
    : TargetLowering(TM), nvTM(&TM), STI(STI) {
  // Always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather than generating calls to memset, memcpy, or memmove.
  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = (unsigned)0xFFFFFFFF;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = (unsigned)0xFFFFFFFF;

  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  setJumpIsExpensive(true);

  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  addBypassSlowDiv(64, 32);

  // By default, use the Source scheduling
  if (sched4reg)
    setSchedulingPreference(Sched::RegPressure);
  else
    setSchedulingPreference(Sched::Source);

  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoF16Action) {
    bool IsOpSupported = STI.allowFP16Math();
    switch (Op) {
    // Several FP16 instructions are available on sm_80 only.
    case ISD::FMINNUM:
    case ISD::FMAXNUM:
    case ISD::FMAXNUM_IEEE:
    case ISD::FMINNUM_IEEE:
    case ISD::FMAXIMUM:
    case ISD::FMINIMUM:
      IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
      break;
    case ISD::FEXP2:
      IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
  };

  auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                    LegalizeAction NoBF16Action) {
    bool IsOpSupported = STI.hasNativeBF16Support(Op);
    setOperationAction(Op, VT, IsOpSupported ? Action : NoBF16Action);
  };

  auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
                                     LegalizeAction NoI16x2Action) {
    bool IsOpSupported = false;
    // instructions are available on sm_90 only
    switch (Op) {
    case ISD::ADD:
    case ISD::SMAX:
    case ISD::SMIN:
    case ISD::UMIN:
    case ISD::UMAX:
      IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
      break;
    }
    setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
  };

  addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
  addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2i16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::v4i8, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
  addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
  addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
  addRegisterClass(MVT::f16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2f16, &NVPTX::Int32RegsRegClass);
  addRegisterClass(MVT::bf16, &NVPTX::Int16RegsRegClass);
  addRegisterClass(MVT::v2bf16, &NVPTX::Int32RegsRegClass);

  // Conversion to/from FP16/FP16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
  if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
    setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64, Legal);

  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);

  // Conversion to/from BF16/BF16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2bf16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2bf16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2bf16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2bf16, Expand);

  setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
    AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);

  // Conversion to/from i16/i16x2 is always legal.
  setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Expand);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i16, Expand);

  setOperationAction(ISD::BUILD_VECTOR, MVT::v4i8, Custom);
  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i8, Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i8, Custom);

  // Custom conversions to/from v2i8.
  setOperationAction(ISD::BITCAST, MVT::v2i8, Custom);

  // Only logical ops can be done on v4i8 directly, others must be done
  // elementwise.
  setOperationAction(
      {ISD::ABS,         ISD::ADD,        ISD::ADDC,        ISD::ADDE,
       ISD::BITREVERSE,  ISD::CTLZ,       ISD::CTPOP,       ISD::CTTZ,
       ISD::FP_TO_SINT,  ISD::FP_TO_UINT, ISD::FSHL,        ISD::FSHR,
       ISD::MUL,         ISD::MULHS,      ISD::MULHU,       ISD::PARITY,
       ISD::ROTL,        ISD::ROTR,       ISD::SADDO,       ISD::SADDO_CARRY,
       ISD::SADDSAT,     ISD::SDIV,       ISD::SDIVREM,     ISD::SELECT_CC,
       ISD::SETCC,       ISD::SHL,        ISD::SINT_TO_FP,  ISD::SMAX,
       ISD::SMIN,        ISD::SMULO,      ISD::SMUL_LOHI,   ISD::SRA,
       ISD::SREM,        ISD::SRL,        ISD::SSHLSAT,     ISD::SSUBO,
       ISD::SSUBO_CARRY, ISD::SSUBSAT,    ISD::SUB,         ISD::SUBC,
       ISD::SUBE,        ISD::UADDO,      ISD::UADDO_CARRY, ISD::UADDSAT,
       ISD::UDIV,        ISD::UDIVREM,    ISD::UINT_TO_FP,  ISD::UMAX,
       ISD::UMIN,        ISD::UMULO,      ISD::UMUL_LOHI,   ISD::UREM,
       ISD::USHLSAT,     ISD::USUBO,      ISD::USUBO_CARRY, ISD::VSELECT,
       ISD::USUBSAT},
      MVT::v4i8, Expand);

  // Operations not directly supported by NVPTX.
  for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
                 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
                 MVT::i32, MVT::i64}) {
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    setOperationAction(ISD::BR_CC, VT, Expand);
  }

  // Some SIGN_EXTEND_INREG can be done using cvt instruction.
  // For others we will expand to a SHL/SRA pair.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i64, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);

  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);

  setOperationAction({ISD::ROTL, ISD::ROTR},
                     {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
                     Expand);

  if (STI.hasHWROT32())
    setOperationAction({ISD::FSHL, ISD::FSHR}, MVT::i32, Legal);

  setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Custom);
  setOperationAction(ISD::BRIND, MVT::Other, Expand);

  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);

  // We want to legalize constant-related memmove and memcpy
  // intrinsics.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);

  // Turn FP extload into load/fpextend
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
  // Turn FP truncstore into trunc + store.
  // FIXME: vector types should also be expanded
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PTX does not support load / store predicate registers
  setOperationAction(ISD::LOAD, MVT::i1, Custom);
  setOperationAction(ISD::STORE, MVT::i1, Custom);

  for (MVT VT : MVT::integer_valuetypes()) {
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
    setTruncStoreAction(VT, MVT::i1, Expand);
  }

  setCondCodeAction({ISD::SETNE, ISD::SETEQ, ISD::SETUGE, ISD::SETULE,
                     ISD::SETUGT, ISD::SETULT, ISD::SETGT, ISD::SETLT,
                     ISD::SETGE, ISD::SETLE},
                    MVT::i1, Expand);

  // Expand extload of vector of integers.
  setLoadExtAction({ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}, MVT::v2i16,
                   MVT::v2i8, Expand);
  setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);

  // This is legal in NVPTX
  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
  setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
  setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
  setOperationAction({ISD::STACKRESTORE, ISD::STACKSAVE}, MVT::Other, Custom);

  // TRAP can be lowered to PTX trap
  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  // DEBUGTRAP can be lowered to PTX brkpt
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // Register custom handling for vector loads/stores
  for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
    if (IsPTXVectorType(VT)) {
      setOperationAction(ISD::LOAD, VT, Custom);
      setOperationAction(ISD::STORE, VT, Custom);
      setOperationAction(ISD::INTRINSIC_W_CHAIN, VT, Custom);
    }
  }

  // Support varargs.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Custom handling for i8 intrinsics
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);

  for (const auto &Ty : {MVT::i16, MVT::i32, MVT::i64}) {
    setOperationAction(ISD::ABS, Ty, Legal);
    setOperationAction(ISD::SMIN, Ty, Legal);
    setOperationAction(ISD::SMAX, Ty, Legal);
    setOperationAction(ISD::UMIN, Ty, Legal);
    setOperationAction(ISD::UMAX, Ty, Legal);

    setOperationAction(ISD::CTPOP, Ty, Legal);
    setOperationAction(ISD::CTLZ, Ty, Legal);
  }

  setI16x2OperationAction(ISD::ABS, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
  setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);

  setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
  setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);

  // Other arithmetic and logic ops are unsupported.
  setOperationAction({ISD::SDIV, ISD::UDIV, ISD::SRA, ISD::SRL, ISD::MULHS,
                      ISD::MULHU, ISD::FP_TO_SINT, ISD::FP_TO_UINT,
                      ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::SETCC},
                     MVT::v2i16, Expand);

  setOperationAction(ISD::ADDC, MVT::i32, Legal);
  setOperationAction(ISD::ADDE, MVT::i32, Legal);
  setOperationAction(ISD::SUBC, MVT::i32, Legal);
  setOperationAction(ISD::SUBE, MVT::i32, Legal);
  if (STI.getPTXVersion() >= 43) {
    setOperationAction(ISD::ADDC, MVT::i64, Legal);
    setOperationAction(ISD::ADDE, MVT::i64, Legal);
    setOperationAction(ISD::SUBC, MVT::i64, Legal);
    setOperationAction(ISD::SUBE, MVT::i64, Legal);
  }

  setOperationAction(ISD::CTTZ, MVT::i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::v2i16, Expand);
  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ, MVT::i64, Expand);

  // PTX does not directly support SELP of i1, so promote to i32 first
  setOperationAction(ISD::SELECT, MVT::i1, Custom);

  // PTX cannot multiply two i64s in a single instruction.
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);

  // We have some custom DAG combine patterns for these nodes
  setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                       ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
                       ISD::BUILD_VECTOR});

  // setcc for f16x2 and bf16x2 needs special handling to prevent
  // legalizer's attempt to scalarize it due to v2i1 not being legal.
  if (STI.allowFP16Math() || STI.hasBF16Math())
    setTargetDAGCombine(ISD::SETCC);

  // Promote fp16 arithmetic if fp16 hardware isn't available or the
  // user passed --nvptx-no-fp16-math. The flag is useful because,
  // although sm_53+ GPUs have some sort of FP16 support in
  // hardware, only sm_53 and sm_60 have full implementation. Others
  // only have a token amount of hardware and are likely to run faster
  // by using fp32 units instead.
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    // bf16 must be promoted to f32.
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  // On SM80, we select add/mul/sub as fma to avoid promotion to float
  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
    for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
      if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
        setOperationAction(Op, VT, Custom);
      }
    }
  }

  // f16/f16x2 neg was introduced in PTX 60, SM_53.
  const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
                                        STI.getPTXVersion() >= 60 &&
                                        STI.allowFP16Math();
  for (const auto &VT : {MVT::f16, MVT::v2f16})
    setOperationAction(ISD::FNEG, VT,
                       IsFP16FP16x2NegAvailable ? Legal : Expand);

  setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
  setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
  // (would be) Library functions.

  // These map to conversion instructions for scalar FP types.
  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
                         ISD::FROUNDEVEN, ISD::FTRUNC}) {
    setOperationAction(Op, MVT::f16, Legal);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
    setOperationAction(ISD::BF16_TO_FP, MVT::f32, Expand);
  }
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
      setOperationAction(ISD::FP_EXTEND, VT, Custom);
      setOperationAction(ISD::FP_ROUND, VT, Custom);
    }
  }

  // sm_80 only has conversions between f32 and bf16. Custom lower all other
  // bf16 conversions.
  if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
    for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
      setOperationAction(
          {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
          VT, Custom);
    }
    setOperationAction(
        {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT},
        MVT::bf16, Custom);
  }

  setOperationAction(ISD::FROUND, MVT::f16, Promote);
  setOperationAction(ISD::FROUND, MVT::v2f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
  setOperationAction(ISD::FROUND, MVT::f32, Custom);
  setOperationAction(ISD::FROUND, MVT::f64, Custom);
  setOperationAction(ISD::FROUND, MVT::bf16, Promote);
  AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);

  // 'Expand' implements FCOPYSIGN without calling an external library.
  setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v2bf16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);

  // These map to corresponding instructions for f32/f64. f16 must be
  // promoted to f32. v2f16 is expanded to f16, which is then promoted
  // to f32.
  for (const auto &Op :
       {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS}) {
    setOperationAction(Op, MVT::f16, Promote);
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setOperationAction(Op, MVT::v2f16, Expand);
    setOperationAction(Op, MVT::v2bf16, Expand);
    setOperationAction(Op, MVT::bf16, Promote);
    AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }

  setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
  if (STI.getPTXVersion() >= 65) {
    setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
    setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
  } else {
    setOperationAction(ISD::FABS, MVT::f16, Promote);
    setOperationAction(ISD::FABS, MVT::v2f16, Expand);
  }
  setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
  setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
  if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
    AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);

  for (const auto &Op : {ISD::FMINNUM, ISD::FMAXNUM}) {
    setOperationAction(Op, MVT::f32, Legal);
    setOperationAction(Op, MVT::f64, Legal);
    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
    if (getOperationAction(Op, MVT::bf16) == Promote)
      AddPromotedToType(Op, MVT::bf16, MVT::f32);
  }
  bool SupportsF32MinMaxNaN =
      STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
  for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
    setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
    setFP16OperationAction(Op, MVT::f16, Legal, Expand);
    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
    setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
    setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
  }

  // Custom lowering for inline asm with 128-bit operands
  setOperationAction(ISD::CopyToReg, MVT::i128, Custom);
  setOperationAction(ISD::CopyFromReg, MVT::i128, Custom);

  // FEXP2 support:
  // - f32
  // - f16/f16x2 (sm_70+, PTX 7.0+)
  // - bf16/bf16x2 (sm_90+, PTX 7.8+)
  // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
  setOperationAction(ISD::FEXP2, MVT::f32, Legal);
  setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
  setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
  setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
  setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);

  // FLOG2 supports f32 only.
  // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
  if (UseApproxLog2F32) {
    setOperationAction(ISD::FLOG2, MVT::f32, Legal);
    setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
    setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
    setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16}, Expand);
  }

  // No FPOW or FREM in PTX.

  // Now deduce the information based on the above-mentioned
  // actions
  computeRegisterProperties(STI.getRegisterInfo());

  setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
  setMaxAtomicSizeInBitsSupported(64);
  setMaxDivRemBitWidthSupported(64);
}

const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {

#define MAKE_CASE(V)                                                           \
  case V:                                                                      \
    return #V;

  switch ((NVPTXISD::NodeType)Opcode) {
  case NVPTXISD::FIRST_NUMBER:
    break;

    MAKE_CASE(NVPTXISD::CALL)
    MAKE_CASE(NVPTXISD::RET_GLUE)
    MAKE_CASE(NVPTXISD::LOAD_PARAM)
    MAKE_CASE(NVPTXISD::Wrapper)
    MAKE_CASE(NVPTXISD::DeclareParam)
    MAKE_CASE(NVPTXISD::DeclareScalarParam)
    MAKE_CASE(NVPTXISD::DeclareRet)
    MAKE_CASE(NVPTXISD::DeclareScalarRet)
    MAKE_CASE(NVPTXISD::DeclareRetParam)
    MAKE_CASE(NVPTXISD::PrintCall)
    MAKE_CASE(NVPTXISD::PrintConvergentCall)
    MAKE_CASE(NVPTXISD::PrintCallUni)
    MAKE_CASE(NVPTXISD::PrintConvergentCallUni)
    MAKE_CASE(NVPTXISD::LoadParam)
    MAKE_CASE(NVPTXISD::LoadParamV2)
    MAKE_CASE(NVPTXISD::LoadParamV4)
    MAKE_CASE(NVPTXISD::StoreParam)
    MAKE_CASE(NVPTXISD::StoreParamV2)
    MAKE_CASE(NVPTXISD::StoreParamV4)
    MAKE_CASE(NVPTXISD::StoreParamS32)
    MAKE_CASE(NVPTXISD::StoreParamU32)
    MAKE_CASE(NVPTXISD::CallArgBegin)
    MAKE_CASE(NVPTXISD::CallArg)
    MAKE_CASE(NVPTXISD::LastCallArg)
    MAKE_CASE(NVPTXISD::CallArgEnd)
    MAKE_CASE(NVPTXISD::CallVoid)
    MAKE_CASE(NVPTXISD::CallVal)
    MAKE_CASE(NVPTXISD::CallSymbol)
    MAKE_CASE(NVPTXISD::Prototype)
    MAKE_CASE(NVPTXISD::MoveParam)
    MAKE_CASE(NVPTXISD::StoreRetval)
    MAKE_CASE(NVPTXISD::StoreRetvalV2)
    MAKE_CASE(NVPTXISD::StoreRetvalV4)
    MAKE_CASE(NVPTXISD::PseudoUseParam)
    MAKE_CASE(NVPTXISD::RETURN)
    MAKE_CASE(NVPTXISD::CallSeqBegin)
    MAKE_CASE(NVPTXISD::CallSeqEnd)
    MAKE_CASE(NVPTXISD::CallPrototype)
    MAKE_CASE(NVPTXISD::ProxyReg)
    MAKE_CASE(NVPTXISD::LoadV2)
    MAKE_CASE(NVPTXISD::LoadV4)
    MAKE_CASE(NVPTXISD::LDUV2)
    MAKE_CASE(NVPTXISD::LDUV4)
    MAKE_CASE(NVPTXISD::StoreV2)
    MAKE_CASE(NVPTXISD::StoreV4)
    MAKE_CASE(NVPTXISD::FSHL_CLAMP)
    MAKE_CASE(NVPTXISD::FSHR_CLAMP)
    MAKE_CASE(NVPTXISD::BFE)
    MAKE_CASE(NVPTXISD::BFI)
    MAKE_CASE(NVPTXISD::PRMT)
    MAKE_CASE(NVPTXISD::FCOPYSIGN)
    MAKE_CASE(NVPTXISD::DYNAMIC_STACKALLOC)
    MAKE_CASE(NVPTXISD::STACKRESTORE)
    MAKE_CASE(NVPTXISD::STACKSAVE)
    MAKE_CASE(NVPTXISD::SETP_F16X2)
    MAKE_CASE(NVPTXISD::SETP_BF16X2)
    MAKE_CASE(NVPTXISD::Dummy)
    MAKE_CASE(NVPTXISD::MUL_WIDE_SIGNED)
    MAKE_CASE(NVPTXISD::MUL_WIDE_UNSIGNED)
    MAKE_CASE(NVPTXISD::BrxEnd)
    MAKE_CASE(NVPTXISD::BrxItem)
    MAKE_CASE(NVPTXISD::BrxStart)
  }
  return nullptr;

#undef MAKE_CASE
}
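
// Note (added comment, not in the original source): each MAKE_CASE(V) above
// expands via the preprocessor to
//   case V:
//     return #V;
// so e.g. MAKE_CASE(NVPTXISD::CALL) returns the string "NVPTXISD::CALL".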

TargetLoweringBase::LegalizeTypeAction
NVPTXTargetLowering::getPreferredVectorAction(MVT VT) const {
  if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
      VT.getScalarType() == MVT::i1)
    return TypeSplitVector;
  return TargetLoweringBase::getPreferredVectorAction(VT);
}

SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
                                             int Enabled, int &ExtraSteps,
                                             bool &UseOneConst,
                                             bool Reciprocal) const {
  if (!(Enabled == ReciprocalEstimate::Enabled ||
        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
    return SDValue();

  if (ExtraSteps == ReciprocalEstimate::Unspecified)
    ExtraSteps = 0;

  SDLoc DL(Operand);
  EVT VT = Operand.getValueType();
  bool Ftz = useF32FTZ(DAG.getMachineFunction());

  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                       DAG.getConstant(IID, DL, MVT::i32), Operand);
  };

  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
    else
      return SDValue();
  } else {
    if (VT == MVT::f32)
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
    else {
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // x * rsqrt(x).)
      return DAG.getNode(
          ISD::INTRINSIC_WO_CHAIN, DL, VT,
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
    }
  }
}
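
// Summary of the selection above (added comment, not in the original source):
//   f32, reciprocal or refinement -> nvvm_rsqrt_approx_f (FTZ variant if set)
//   f64, reciprocal or refinement -> nvvm_rsqrt_approx_d
//   f32, plain sqrt, no refinement -> nvvm_sqrt_approx_f (FTZ variant if set)
//   f64, plain sqrt, no refinement -> nvvm_rcp_approx_ftz_d(nvvm_rsqrt_approx_d(x)),
//     since there is no sqrt.approx.f64 instruction.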

SDValue
NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);
  const GlobalAddressSDNode *GAN = cast<GlobalAddressSDNode>(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout(), GAN->getAddressSpace());
  Op = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, PtrVT);
  return DAG.getNode(NVPTXISD::Wrapper, dl, PtrVT, Op);
}

static bool IsTypePassedAsArray(const Type *Ty) {
  return Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128) ||
         Ty->isHalfTy() || Ty->isBFloatTy();
}

std::string NVPTXTargetLowering::getPrototype(
    const DataLayout &DL, Type *retTy, const ArgListTy &Args,
    const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  auto PtrVT = getPointerTy(DL);

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return "";

  std::string Prototype;
  raw_string_ostream O(Prototype);
  O << "prototype_" << UniqueCallSite << " : .callprototype ";

  if (retTy->getTypeID() == Type::VoidTyID) {
    O << "()";
  } else {
    O << "(";
    if ((retTy->isFloatingPointTy() || retTy->isIntegerTy()) &&
        !IsTypePassedAsArray(retTy)) {
      unsigned size = 0;
      if (auto *ITy = dyn_cast<IntegerType>(retTy)) {
        size = ITy->getBitWidth();
      } else {
        assert(retTy->isFloatingPointTy() &&
               "Floating point type expected here");
        size = retTy->getPrimitiveSizeInBits();
      }
      // PTX ABI requires all scalar return values to be at least 32
      // bits in size. fp16 normally uses .b16 as its storage type in
      // PTX, so its size must be adjusted here, too.
      size = promoteScalarArgumentSize(size);

      O << ".param .b" << size << " _";
    } else if (isa<PointerType>(retTy)) {
      O << ".param .b" << PtrVT.getSizeInBits() << " _";
    } else if (IsTypePassedAsArray(retTy)) {
      O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
        << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
    } else {
      llvm_unreachable("Unknown return type");
    }
    O << ") ";
  }
  O << "_ (";

  bool first = true;

  unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
  for (unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
    Type *Ty = Args[i].Ty;
    if (!first) {
      O << ", ";
    }
    first = false;

    if (!Outs[OIdx].Flags.isByVal()) {
      if (IsTypePassedAsArray(Ty)) {
        Align ParamAlign =
            getArgumentAlignment(&CB, Ty, i + AttributeList::FirstArgIndex, DL);
        O << ".param .align " << ParamAlign.value() << " .b8 ";
        O << "_";
        O << "[" << DL.getTypeAllocSize(Ty) << "]";
        // update the index for Outs
        SmallVector<EVT, 16> vtparts;
        ComputeValueVTs(*this, DL, Ty, vtparts);
        if (unsigned len = vtparts.size())
          OIdx += len - 1;
        continue;
      }
      // i8 types in IR will be i16 types in SDAG
      assert((getValueType(DL, Ty) == Outs[OIdx].VT ||
              (getValueType(DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
             "type mismatch between callee prototype and arguments");
      // scalar type
      unsigned sz = 0;
      if (isa<IntegerType>(Ty)) {
        sz = cast<IntegerType>(Ty)->getBitWidth();
        sz = promoteScalarArgumentSize(sz);
      } else if (isa<PointerType>(Ty)) {
        sz = PtrVT.getSizeInBits();
      } else {
        sz = Ty->getPrimitiveSizeInBits();
      }
      O << ".param .b" << sz << " ";
      O << "_";
      continue;
    }

    // Indirect calls need strict ABI alignment so we disable optimizations by
    // not providing a function to optimize.
    Type *ETy = Args[i].IndirectType;
    Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
    Align ParamByValAlign =
        getFunctionByValParamAlign(/*F=*/nullptr, ETy, InitialAlign, DL);

    O << ".param .align " << ParamByValAlign.value() << " .b8 ";
    O << "_";
    O << "[" << Outs[OIdx].Flags.getByValSize() << "]";
  }

  if (VAInfo)
    O << (first ? "" : ",") << " .param .align " << VAInfo->second
      << " .b8 _[]\n";
  O << ")";
  if (shouldEmitPTXNoReturn(&CB, *nvTM))
    O << " .noreturn";
  O << ";";

  return Prototype;
}
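
// Illustrative output (added comment, not in the original source): for an
// indirect call to a function of IR type `float (i32, ptr)` on a 64-bit
// target with UniqueCallSite = 1, the string built above would look roughly
// like:
//   prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .b64 _);
// (the float return is widened to .b32 by promoteScalarArgumentSize above).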

Align NVPTXTargetLowering::getFunctionArgumentAlignment(
    const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const {
  return getAlign(*F, Idx).value_or(getFunctionParamOptimizedAlign(F, Ty, DL));
}

Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
                                                unsigned Idx,
                                                const DataLayout &DL) const {
  if (!CB) {
    // CallSite is zero, fallback to ABI type alignment
    return DL.getABITypeAlign(Ty);
  }

  const Function *DirectCallee = CB->getCalledFunction();

  if (!DirectCallee) {
    // We don't have a direct function symbol, but that may be because of
    // constant cast instructions in the call.

    // With bitcast'd call targets, the instruction will be the call
    if (const auto *CI = dyn_cast<CallInst>(CB)) {
      // Check if we have call alignment metadata
      if (MaybeAlign StackAlign = getAlign(*CI, Idx))
        return StackAlign.value();
    }
    DirectCallee = getMaybeBitcastedCallee(CB);
  }

  // Check for function alignment information if we found that the
  // ultimate target is a Function
  if (DirectCallee)
    return getFunctionArgumentAlignment(DirectCallee, Ty, Idx, DL);

  // Call is indirect, fall back to the ABI type alignment
  return DL.getABITypeAlign(Ty);
}

static bool adjustElementType(EVT &ElementType) {
  switch (ElementType.getSimpleVT().SimpleTy) {
  default:
    return false;
  case MVT::f16:
  case MVT::bf16:
    ElementType = MVT::i16;
    return true;
  case MVT::f32:
  case MVT::v2f16:
  case MVT::v2bf16:
    ElementType = MVT::i32;
    return true;
  case MVT::f64:
    ElementType = MVT::i64;
    return true;
  }
}

// Use byte-store when the param address of the argument value is unaligned.
// This may happen when the return value is a field of a packed structure.
//
// This is called in LowerCall() when passing the param values.
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain,
                                        uint64_t Offset, EVT ElementType,
                                        SDValue StVal, SDValue &InGlue,
                                        unsigned ArgID, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    StVal = DAG.getNode(ISD::BITCAST, dl, ElementType, StVal);

  // Store each byte
  SDVTList StoreVTs = DAG.getVTList(MVT::Other, MVT::Glue);
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, StVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(ArgID, dl, MVT::i32),
                               DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal, InGlue};
    // Trunc store only the last byte by using
    //   st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(
        NVPTXISD::StoreParam, dl, StoreVTs, StoreOperands, MVT::i8,
        MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
    InGlue = Chain.getValue(1);
  }
  return Chain;
}
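
// Worked example (added comment, not in the original source): storing an f32
// param piece at an unaligned Offset first bitcasts the value to i32, then
// emits four one-byte st.param.b8 stores of
//   (i32 >> 0)  at Offset + 0,  (i32 >> 8)  at Offset + 1,
//   (i32 >> 16) at Offset + 2,  (i32 >> 24) at Offset + 3.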

// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
static SDValue
LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset,
                           EVT ElementType, SDValue &InGlue,
                           SmallVectorImpl<SDValue> &TempProxyRegOps,
                           const SDLoc &dl) {
  // Bit logic only works on integer types
  EVT MergedType = ElementType;
  adjustElementType(MergedType);

  // Load each byte and construct the whole value. Initial value to 0
  SDValue RetVal = DAG.getConstant(0, dl, MergedType);
  // LoadParamMemI8 loads into i16 register only
  SDVTList LoadVTs = DAG.getVTList(MVT::i16, MVT::Other, MVT::Glue);
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    SDValue LoadOperands[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
                              DAG.getConstant(Offset + i, dl, MVT::i32),
                              InGlue};
    // This will be selected to LoadParamMemI8
    SDValue LdVal =
        DAG.getMemIntrinsicNode(NVPTXISD::LoadParam, dl, LoadVTs, LoadOperands,
                                MVT::i8, MachinePointerInfo(), Align(1));
    SDValue TmpLdVal = LdVal.getValue(0);
    Chain = LdVal.getValue(1);
    InGlue = LdVal.getValue(2);

    TmpLdVal = DAG.getNode(NVPTXISD::ProxyReg, dl,
                           TmpLdVal.getSimpleValueType(), TmpLdVal);
    TempProxyRegOps.push_back(TmpLdVal);

    SDValue CMask = DAG.getConstant(255, dl, MergedType);
    SDValue CShift = DAG.getConstant(i * 8, dl, MVT::i32);
    // Need to extend the i16 register to the whole width.
    TmpLdVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MergedType, TmpLdVal);
    // Mask off the high bits. Leave only the lower 8 bits.
    // Do this because we are using loadparam.b8.
    TmpLdVal = DAG.getNode(ISD::AND, dl, MergedType, TmpLdVal, CMask);
    // Shift and merge
    TmpLdVal = DAG.getNode(ISD::SHL, dl, MergedType, TmpLdVal, CShift);
    RetVal = DAG.getNode(ISD::OR, dl, MergedType, RetVal, TmpLdVal);
  }
  if (ElementType != MergedType)
    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);

  return RetVal;
}
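
// Worked example (added comment, not in the original source): reconstructing
// an f32 return piece from bytes b0..b3 loaded at Offset+0..Offset+3:
//   i32 = (b0 & 0xff) | (b1 & 0xff) << 8 | (b2 & 0xff) << 16 | (b3 & 0xff) << 24
// followed by a final bitcast of the merged i32 back to f32.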
1401
1402staticboolshouldConvertToIndirectCall(constCallBase *CB,
1403constGlobalAddressSDNode *Func) {
1404if (!Func)
1405returnfalse;
1406if (auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1407return CB->getFunctionType() != CalleeFunc->getFunctionType();
1408returnfalse;
1409}
1410
1411SDValueNVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
1412SmallVectorImpl<SDValue> &InVals) const{
1413
1414if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1415report_fatal_error(
1416"Support for variadic functions (unsized array parameter) introduced "
1417"in PTX ISA version 6.0 and requires target sm_30.");
1418
1419SelectionDAG &DAG = CLI.DAG;
1420SDLoc dl = CLI.DL;
1421SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
1422SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
1423SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1424SDValue Chain = CLI.Chain;
1425SDValue Callee = CLI.Callee;
1426bool &isTailCall = CLI.IsTailCall;
1427ArgListTy &Args = CLI.getArgs();
1428Type *RetTy = CLI.RetTy;
1429constCallBase *CB = CLI.CB;
1430constDataLayout &DL = DAG.getDataLayout();
1431
1432bool isABI = (STI.getSmVersion() >= 20);
1433assert(isABI &&"Non-ABI compilation is not supported");
1434if (!isABI)
1435return Chain;
1436
1437// Variadic arguments.
1438//
1439// Normally, for each argument, we declare a param scalar or a param
1440// byte array in the .param space, and store the argument value to that
1441// param scalar or array starting at offset 0.
1442//
1443// In the case of the first variadic argument, we declare a vararg byte array
1444// with size 0. The exact size of this array isn't known at this point, so
1445// it'll be patched later. All the variadic arguments will be stored to this
1446// array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1447// initially set to 0, so it can be used for non-variadic arguments (which use
1448// 0 offset) to simplify the code.
1449//
1450// After all vararg is processed, 'VAOffset' holds the size of the
1451// vararg byte array.
1452
1453SDValue VADeclareParam;// vararg byte array
1454unsigned FirstVAArg = CLI.NumFixedArgs;// position of the first variadic
1455unsigned VAOffset = 0;// current offset in the param array
1456
1457unsigned UniqueCallSite =GlobalUniqueCallSite.fetch_add(1);
1458SDValue TempChain = Chain;
1459 Chain = DAG.getCALLSEQ_START(Chain, UniqueCallSite, 0, dl);
1460SDValue InGlue = Chain.getValue(1);
1461
1462unsigned ParamCount = 0;
1463// Args.size() and Outs.size() need not match.
1464// Outs.size() will be larger
1465// * if there is an aggregate argument with multiple fields (each field
1466// showing up separately in Outs)
1467// * if there is a vector argument with more than typical vector-length
1468// elements (generally if more than 4) where each vector element is
1469// individually present in Outs.
1470// So a different index should be used for indexing into Outs/OutVals.
1471// See similar issue in LowerFormalArguments.
1472unsigned OIdx = 0;
1473// Declare the .params or .reg need to pass values
1474// to the function
1475for (unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1476EVT VT = Outs[OIdx].VT;
1477Type *Ty = Args[i].Ty;
1478bool IsVAArg = (i >= CLI.NumFixedArgs);
1479bool IsByVal = Outs[OIdx].Flags.isByVal();
1480
1481SmallVector<EVT, 16> VTs;
1482SmallVector<uint64_t, 16> Offsets;
1483
1484assert((!IsByVal || Args[i].IndirectType) &&
1485"byval arg must have indirect type");
1486Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1487ComputePTXValueVTs(*this,DL, ETy, VTs, &Offsets, IsByVal ? 0 : VAOffset);
1488
1489Align ArgAlign;
1490if (IsByVal) {
1491// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1492// so we don't need to worry whether it's naturally aligned or not.
1493// See TargetLowering::LowerCallTo().
1494Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1495 ArgAlign =getFunctionByValParamAlign(CB->getCalledFunction(), ETy,
1496 InitialAlign,DL);
1497if (IsVAArg)
1498 VAOffset =alignTo(VAOffset, ArgAlign);
1499 }else {
1500 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1,DL);
1501 }
1502
1503unsignedTypeSize =
1504 (IsByVal ? Outs[OIdx].Flags.getByValSize() :DL.getTypeAllocSize(Ty));
1505SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1506
1507bool NeedAlign;// Does argument declaration specify alignment?
1508bool PassAsArray = IsByVal ||IsTypePassedAsArray(Ty);
1509if (IsVAArg) {
1510if (ParamCount == FirstVAArg) {
1511SDValue DeclareParamOps[] = {
1512 Chain, DAG.getConstant(STI.getMaxRequiredAlignment(), dl, MVT::i32),
1513 DAG.getConstant(ParamCount, dl, MVT::i32),
1514 DAG.getConstant(1, dl, MVT::i32), InGlue};
1515 VADeclareParam = Chain = DAG.getNode(NVPTXISD::DeclareParam, dl,
1516 DeclareParamVTs, DeclareParamOps);
1517 }
1518 NeedAlign = PassAsArray;
1519 }elseif (PassAsArray) {
1520// declare .param .align <align> .b8 .param<n>[<size>];
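// e.g. (illustrative, assuming a 16-byte aggregate with 8-byte alignment):
// .param .align 8 .b8 param0[16];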
1521SDValue DeclareParamOps[] = {
1522 Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
1523 DAG.getConstant(ParamCount, dl, MVT::i32),
1524 DAG.getConstant(TypeSize, dl, MVT::i32), InGlue};
1525 Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
1526 DeclareParamOps);
1527 NeedAlign =true;
1528 }else {
1529// declare .param .b<size> .param<n>;
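// e.g. (illustrative): .param .b32 param1; -- an i8 or i16 scalar is first
// promoted to 32 bits, per the promotion below.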
1530if (VT.isInteger() || VT.isFloatingPoint()) {
1531// PTX ABI requires integral types to be at least 32 bits in
1532// size. FP16 is loaded/stored using i16, so it's handled
1533// here as well.
1534TypeSize =promoteScalarArgumentSize(TypeSize * 8) / 8;
1535 }
1536SDValue DeclareScalarParamOps[] = {
1537 Chain, DAG.getConstant(ParamCount, dl, MVT::i32),
1538 DAG.getConstant(TypeSize * 8, dl, MVT::i32),
1539 DAG.getConstant(0, dl, MVT::i32), InGlue};
1540 Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
1541 DeclareScalarParamOps);
1542 NeedAlign =false;
1543 }
1544 InGlue = Chain.getValue(1);
1545
1546// PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1547// than 32-bits are sign extended or zero extended, depending on
1548// whether they are signed or unsigned types. This case applies
1549// only to scalar parameters and not to aggregate values.
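// For example (illustrative), an i16 argument is passed as a .b32 param with
// its value sign- or zero-extended to 32 bits by the code below.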
1550bool ExtendIntegerParam =
1551 Ty->isIntegerTy() &&DL.getTypeAllocSizeInBits(Ty) < 32;
1552
1553auto VectorInfo =VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1554SmallVector<SDValue, 6> StoreOperands;
1555for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
1556EVT EltVT = VTs[j];
1557int CurOffset = Offsets[j];
1558MaybeAlign PartAlign;
1559if (NeedAlign)
1560 PartAlign =commonAlignment(ArgAlign, CurOffset);
1561
1562SDValue StVal = OutVals[OIdx];
1563
1564MVT PromotedVT;
1565if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
1566 EltVT =EVT(PromotedVT);
1567 }
1568if (PromoteScalarIntegerPTX(StVal.getValueType(), &PromotedVT)) {
1569llvm::ISD::NodeType Ext =
1570 Outs[OIdx].Flags.isSExt() ?ISD::SIGN_EXTEND :ISD::ZERO_EXTEND;
1571 StVal = DAG.getNode(Ext, dl, PromotedVT, StVal);
1572 }
1573
1574if (IsByVal) {
1575auto PtrVT =getPointerTy(DL);
1576SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StVal,
1577 DAG.getConstant(CurOffset, dl, PtrVT));
1578 StVal = DAG.getLoad(EltVT, dl, TempChain, srcAddr,MachinePointerInfo(),
1579 PartAlign);
1580 }elseif (ExtendIntegerParam) {
1581assert(VTs.size() == 1 &&"Scalar can't have multiple parts.");
1582// zext/sext to i32
1583 StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ?ISD::SIGN_EXTEND
1584 :ISD::ZERO_EXTEND,
1585 dl, MVT::i32, StVal);
1586 }
1587
1588if (!ExtendIntegerParam && EltVT.getSizeInBits() < 16) {
1589// Use 16-bit registers for small stores as it's the
1590// smallest general purpose register size supported by NVPTX.
1591 StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
1592 }
1593
1594// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1595// scalar store. In such cases, fall back to byte stores.
1596if (VectorInfo[j] ==PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1597 PartAlign.value() <
1598DL.getABITypeAlign(EltVT.getTypeForEVT(*DAG.getContext()))) {
1599assert(StoreOperands.empty() &&"Unfinished preceding store.");
1600 Chain =LowerUnalignedStoreParam(
1601 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1602 StVal, InGlue, ParamCount, dl);
1603
1604// LowerUnalignedStoreParam took care of inserting the necessary nodes
1605// into the SDAG, so just move on to the next element.
1606if (!IsByVal)
1607 ++OIdx;
1608continue;
1609 }
1610
1611// New store.
1612if (VectorInfo[j] &PVF_FIRST) {
1613assert(StoreOperands.empty() &&"Unfinished preceding store.");
1614 StoreOperands.push_back(Chain);
1615 StoreOperands.push_back(
1616 DAG.getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1617
1618 StoreOperands.push_back(DAG.getConstant(
1619 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1620 dl, MVT::i32));
1621 }
1622
1623// Record the value to store.
1624 StoreOperands.push_back(StVal);
1625
1626if (VectorInfo[j] &PVF_LAST) {
1627unsigned NumElts = StoreOperands.size() - 3;
1628NVPTXISD::NodeTypeOp;
1629switch (NumElts) {
1630case 1:
1631Op =NVPTXISD::StoreParam;
1632break;
1633case 2:
1634Op =NVPTXISD::StoreParamV2;
1635break;
1636case 4:
1637Op =NVPTXISD::StoreParamV4;
1638break;
1639default:
1640llvm_unreachable("Invalid vector info.");
1641 }
1642
1643 StoreOperands.push_back(InGlue);
1644
1645// Adjust type of the store op if we've extended the scalar
1646// return value.
1647EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1648
1649 Chain = DAG.getMemIntrinsicNode(
1650Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
1651 TheStoreType,MachinePointerInfo(), PartAlign,
1652MachineMemOperand::MOStore);
1653 InGlue = Chain.getValue(1);
1654
1655// Cleanup.
1656 StoreOperands.clear();
1657
1658// TODO: We may need to support vector types that can be passed
1659// as scalars in variadic arguments.
1660if (!IsByVal && IsVAArg) {
1661assert(NumElts == 1 &&
1662"Vectorization is expected to be disabled for variadics.");
1663 VAOffset +=DL.getTypeAllocSize(
1664 TheStoreType.getTypeForEVT(*DAG.getContext()));
1665 }
1666 }
1667if (!IsByVal)
1668 ++OIdx;
1669 }
1670assert(StoreOperands.empty() &&"Unfinished parameter store.");
1671if (!IsByVal && VTs.size() > 0)
1672 --OIdx;
1673 ++ParamCount;
1674if (IsByVal && IsVAArg)
1675 VAOffset +=TypeSize;
1676 }
1677
1678GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1679MaybeAlign retAlignment = std::nullopt;
1680
1681// Handle Result
1682if (Ins.size() > 0) {
1683SmallVector<EVT, 16> resvtparts;
1684ComputeValueVTs(*this,DL,RetTy, resvtparts);
1685
1686// Declare
1687// .param .align N .b8 retval0[<size-in-bytes>], or
1688// .param .b<size-in-bits> retval0
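// e.g. (illustrative): .param .b32 retval0; for an i32 return value, or
// .param .align 8 .b8 retval0[16]; for a 16-byte aggregate.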
1689unsigned resultsz =DL.getTypeAllocSizeInBits(RetTy);
1690if (!IsTypePassedAsArray(RetTy)) {
1691 resultsz =promoteScalarArgumentSize(resultsz);
1692SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1693SDValue DeclareRetOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1694 DAG.getConstant(resultsz, dl, MVT::i32),
1695 DAG.getConstant(0, dl, MVT::i32), InGlue };
1696 Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
1697 DeclareRetOps);
1698 InGlue = Chain.getValue(1);
1699 }else {
1700 retAlignment = getArgumentAlignment(CB,RetTy, 0,DL);
1701assert(retAlignment &&"retAlignment is guaranteed to be set");
1702SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1703SDValue DeclareRetOps[] = {
1704 Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
1705 DAG.getConstant(resultsz / 8, dl, MVT::i32),
1706 DAG.getConstant(0, dl, MVT::i32), InGlue};
1707 Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
1708 DeclareRetOps);
1709 InGlue = Chain.getValue(1);
1710 }
1711 }
1712
1713bool HasVAArgs = CLI.IsVarArg && (CLI.Args.size() > CLI.NumFixedArgs);
1714// Set the size of the vararg param byte array if the callee is a variadic
1715// function and the variadic part is not empty.
1716if (HasVAArgs) {
1717SDValue DeclareParamOps[] = {
1718 VADeclareParam.getOperand(0), VADeclareParam.getOperand(1),
1719 VADeclareParam.getOperand(2), DAG.getConstant(VAOffset, dl, MVT::i32),
1720 VADeclareParam.getOperand(4)};
1721 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1722 VADeclareParam->getVTList(), DeclareParamOps);
1723 }
1724
1725// If the type of the callsite does not match that of the function, convert
1726// the callsite to an indirect call.
1727bool ConvertToIndirectCall =shouldConvertToIndirectCall(CB, Func);
1728
1729// Both indirect calls and libcalls have nullptr Func. In order to distinguish
1730// between them, we must rely on the call site value, which is valid for
1731// indirect calls but is always null for libcalls.
1732boolisIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1733
1734if (isa<ExternalSymbolSDNode>(Callee)) {
1735Function* CalleeFunc =nullptr;
1736
1737// Try to find the callee in the current module.
1738 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1739assert(CalleeFunc !=nullptr &&"Libcall callee must be set.");
1740
1741// Set the "libcall callee" attribute to indicate that the function
1742// must always have a declaration.
1743 CalleeFunc->addFnAttr("nvptx-libcall-callee","true");
1744 }
1745
1746if (isIndirectCall) {
1747// This is indirect function call case : PTX requires a prototype of the
1748// form
1749// proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1750// to be emitted, and the label has to be used as the last arg of the call
1751// instruction.
1752// The prototype is embedded in a string and put as the operand for a
1753// CallPrototype SDNode which will print out to the value of the string.
1754SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1755 std::string Proto =getPrototype(
1756DL,RetTy, Args, Outs, retAlignment,
1757 HasVAArgs
1758 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1759 CLI.NumFixedArgs, VADeclareParam->getConstantOperandAPInt(1)))
1760 : std::nullopt,
1761 *CB, UniqueCallSite);
1762constchar *ProtoStr =nvTM->getStrPool().save(Proto).data();
1763SDValue ProtoOps[] = {
1764 Chain,
1765 DAG.getTargetExternalSymbol(ProtoStr, MVT::i32),
1766 InGlue,
1767 };
1768 Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
1769 InGlue = Chain.getValue(1);
1770 }
1771// Op to just print "call"
1772SDVTList PrintCallVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1773SDValue PrintCallOps[] = {
1774 Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1775 };
1776// We model convergent calls as separate opcodes.
1777unsigned Opcode =isIndirectCall ?NVPTXISD::PrintCall :NVPTXISD::PrintCallUni;
1778if (CLI.IsConvergent)
1779 Opcode = Opcode ==NVPTXISD::PrintCallUni ?NVPTXISD::PrintConvergentCallUni
1780 :NVPTXISD::PrintConvergentCall;
1781 Chain = DAG.getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1782 InGlue = Chain.getValue(1);
1783
1784if (ConvertToIndirectCall) {
1785// Copy the function ptr to a ptx register and use the register to call the
1786// function.
1787EVT DestVT = Callee.getValueType();
1788MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
1789constTargetLowering &TLI = DAG.getTargetLoweringInfo();
1790unsigned DestReg =
1791 RegInfo.createVirtualRegister(TLI.getRegClassFor(DestVT.getSimpleVT()));
1792auto RegCopy = DAG.getCopyToReg(DAG.getEntryNode(), dl, DestReg, Callee);
1793 Callee = DAG.getCopyFromReg(RegCopy, dl, DestReg, DestVT);
1794 }
1795
1796// Ops to print out the function name
1797SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1798SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1799 Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
1800 InGlue = Chain.getValue(1);
1801
1802// Ops to print out the param list
1803SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1804SDValue CallArgBeginOps[] = { Chain, InGlue };
1805 Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
1806 CallArgBeginOps);
1807 InGlue = Chain.getValue(1);
1808
1809for (unsigned i = 0, e = std::min(CLI.NumFixedArgs + 1, ParamCount); i != e;
1810 ++i) {
1811unsigned opcode;
1812if (i == (e - 1))
1813 opcode =NVPTXISD::LastCallArg;
1814else
1815 opcode =NVPTXISD::CallArg;
1816SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1817SDValue CallArgOps[] = { Chain, DAG.getConstant(1, dl, MVT::i32),
1818 DAG.getConstant(i, dl, MVT::i32), InGlue };
1819 Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
1820 InGlue = Chain.getValue(1);
1821 }
1822SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1823SDValue CallArgEndOps[] = { Chain,
1824 DAG.getConstant(isIndirectCall ? 0 : 1, dl, MVT::i32),
1825 InGlue };
1826 Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
1827 InGlue = Chain.getValue(1);
1828
1829if (isIndirectCall) {
1830SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
1831SDValue PrototypeOps[] = {
1832 Chain, DAG.getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
1833 Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
1834 InGlue = Chain.getValue(1);
1835 }
1836
1837SmallVector<SDValue, 16> ProxyRegOps;
1838SmallVector<std::optional<MVT>, 16> ProxyRegTruncates;
1839// An item of the vector is set if the corresponding element does not need a
1840// ProxyReg operation and should be added to InVals as is. ProxyRegOps and
1841// ProxyRegTruncates contain empty/none items at the same index.
1842SmallVector<SDValue, 16> RetElts;
1843// Temporary ProxyReg operations inserted in `LowerUnalignedLoadRetParam()`
1844// to use the values of `LoadParam`s. They are replaced later, when
1845// `CALLSEQ_END` is added.
1846SmallVector<SDValue, 16> TempProxyRegOps;
1847
1848// Generate loads from param memory/moves from registers for result
1849if (Ins.size() > 0) {
1850SmallVector<EVT, 16> VTs;
1851SmallVector<uint64_t, 16> Offsets;
1852ComputePTXValueVTs(*this,DL,RetTy, VTs, &Offsets, 0);
1853assert(VTs.size() == Ins.size() &&"Bad value decomposition");
1854
1855Align RetAlign = getArgumentAlignment(CB,RetTy, 0,DL);
1856auto VectorInfo =VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1857
1858SmallVector<EVT, 6> LoadVTs;
1859int VecIdx = -1;// Index of the first element of the vector.
1860
1861// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1862// 32-bits are sign extended or zero extended, depending on whether
1863// they are signed or unsigned types.
1864bool ExtendIntegerRetVal =
1865RetTy->isIntegerTy() &&DL.getTypeAllocSizeInBits(RetTy) < 32;
1866
1867for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
1868bool needTruncate =false;
1869EVT TheLoadType = VTs[i];
1870EVT EltType = Ins[i].VT;
1871Align EltAlign =commonAlignment(RetAlign, Offsets[i]);
1872MVT PromotedVT;
1873
1874if (PromoteScalarIntegerPTX(TheLoadType, &PromotedVT)) {
1875 TheLoadType =EVT(PromotedVT);
1876 EltType =EVT(PromotedVT);
1877 needTruncate =true;
1878 }
1879
1880if (ExtendIntegerRetVal) {
1881 TheLoadType = MVT::i32;
1882 EltType = MVT::i32;
1883 needTruncate =true;
1884 }elseif (TheLoadType.getSizeInBits() < 16) {
1885if (VTs[i].isInteger())
1886 needTruncate =true;
1887 EltType = MVT::i16;
1888 }
1889
1890// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a
1891// scalar load. In such cases, fall back to byte loads.
1892if (VectorInfo[i] ==PVF_SCALAR &&RetTy->isAggregateType() &&
1893 EltAlign <DL.getABITypeAlign(
1894 TheLoadType.getTypeForEVT(*DAG.getContext()))) {
1895assert(VecIdx == -1 && LoadVTs.empty() &&"Orphaned operand list.");
1896SDValue Ret =LowerUnalignedLoadRetParam(
1897 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
1898 ProxyRegOps.push_back(SDValue());
1899 ProxyRegTruncates.push_back(std::optional<MVT>());
1900 RetElts.resize(i);
1901 RetElts.push_back(Ret);
1902
1903continue;
1904 }
1905
1906// Record index of the very first element of the vector.
1907if (VectorInfo[i] &PVF_FIRST) {
1908assert(VecIdx == -1 && LoadVTs.empty() &&"Orphaned operand list.");
1909 VecIdx = i;
1910 }
1911
1912 LoadVTs.push_back(EltType);
1913
1914if (VectorInfo[i] &PVF_LAST) {
1915unsigned NumElts = LoadVTs.size();
1916 LoadVTs.push_back(MVT::Other);
1917 LoadVTs.push_back(MVT::Glue);
1918NVPTXISD::NodeTypeOp;
1919switch (NumElts) {
1920case 1:
1921Op =NVPTXISD::LoadParam;
1922break;
1923case 2:
1924Op =NVPTXISD::LoadParamV2;
1925break;
1926case 4:
1927Op =NVPTXISD::LoadParamV4;
1928break;
1929default:
1930llvm_unreachable("Invalid vector info.");
1931 }
1932
1933SDValue LoadOperands[] = {
1934 Chain, DAG.getConstant(1, dl, MVT::i32),
1935 DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
1936SDValue RetVal = DAG.getMemIntrinsicNode(
1937Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
1938MachinePointerInfo(), EltAlign,
1939MachineMemOperand::MOLoad);
1940
1941for (unsigned j = 0; j < NumElts; ++j) {
1942 ProxyRegOps.push_back(RetVal.getValue(j));
1943
1944if (needTruncate)
1945 ProxyRegTruncates.push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1946else
1947 ProxyRegTruncates.push_back(std::optional<MVT>());
1948 }
1949
1950 Chain = RetVal.getValue(NumElts);
1951 InGlue = RetVal.getValue(NumElts + 1);
1952
1953// Cleanup
1954 VecIdx = -1;
1955 LoadVTs.clear();
1956 }
1957 }
1958 }
1959
1960 Chain =
1961 DAG.getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
1962 InGlue = Chain.getValue(1);
1963
1964// Append ProxyReg instructions to the chain to make sure that `callseq_end`
1965// will not get lost. Otherwise, during libcall expansion, the nodes can
1966// become dangling.
1967for (unsigned i = 0; i < ProxyRegOps.size(); ++i) {
1968if (i < RetElts.size() && RetElts[i]) {
1969 InVals.push_back(RetElts[i]);
1970continue;
1971 }
1972
1973SDValue Ret = DAG.getNode(
1974NVPTXISD::ProxyReg, dl,
1975 DAG.getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1976 { Chain, ProxyRegOps[i], InGlue }
1977 );
1978
1979 Chain = Ret.getValue(1);
1980 InGlue = Ret.getValue(2);
1981
1982if (ProxyRegTruncates[i]) {
1983 Ret = DAG.getNode(ISD::TRUNCATE, dl, *ProxyRegTruncates[i], Ret);
1984 }
1985
1986 InVals.push_back(Ret);
1987 }
1988
1989for (SDValue &T : TempProxyRegOps) {
1990SDValue Repl = DAG.getNode(
1991NVPTXISD::ProxyReg, dl,
1992 DAG.getVTList(T.getSimpleValueType(), MVT::Other, MVT::Glue),
1993 {Chain, T.getOperand(0), InGlue});
1994 DAG.ReplaceAllUsesWith(T, Repl);
1995 DAG.RemoveDeadNode(T.getNode());
1996
1997 Chain = Repl.getValue(1);
1998 InGlue = Repl.getValue(2);
1999 }
2000
2001// Set isTailCall to false for now, until we figure out how to express
2002// tail call optimization in PTX.
2003 isTailCall =false;
2004return Chain;
2005}
2006
2007SDValueNVPTXTargetLowering::LowerDYNAMIC_STACKALLOC(SDValueOp,
2008SelectionDAG &DAG) const{
2009
2010if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2011constFunction &Fn = DAG.getMachineFunction().getFunction();
2012
2013DiagnosticInfoUnsupported NoDynamicAlloca(
2014 Fn,
2015"Support for dynamic alloca introduced in PTX ISA version 7.3 and "
2016"requires target sm_52.",
2017SDLoc(Op).getDebugLoc());
2018 DAG.getContext()->diagnose(NoDynamicAlloca);
2019auto Ops = {DAG.getConstant(0,SDLoc(),Op.getValueType()),
2020Op.getOperand(0)};
2021return DAG.getMergeValues(Ops,SDLoc());
2022 }
2023
2024SDValue Chain =Op.getOperand(0);
2025SDValueSize =Op.getOperand(1);
2026uint64_tAlign = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
2027SDLocDL(Op.getNode());
2028
2029// The size operand of the PTX alloca instruction is 64-bit for m64 and 32-bit for m32.
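// e.g. (illustrative, assuming m64): alloca.u64 %rd1, %rd2, 8; allocates
// %rd2 bytes with 8-byte alignment and returns the pointer in %rd1.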
2030MVT ValueSizeTy =nvTM->is64Bit() ? MVT::i64 : MVT::i32;
2031
2032SDValue AllocOps[] = {Chain, DAG.getZExtOrTrunc(Size,DL, ValueSizeTy),
2033 DAG.getTargetConstant(Align,DL, MVT::i32)};
2034EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2035return DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC,DL, RetTypes, AllocOps);
2036}
2037
2038SDValueNVPTXTargetLowering::LowerSTACKRESTORE(SDValueOp,
2039SelectionDAG &DAG) const{
2040SDLocDL(Op.getNode());
2041if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2042constFunction &Fn = DAG.getMachineFunction().getFunction();
2043
2044DiagnosticInfoUnsupported NoStackRestore(
2045 Fn,
2046"Support for stackrestore requires PTX ISA version >= 7.3 and target "
2047">= sm_52.",
2048DL.getDebugLoc());
2049 DAG.getContext()->diagnose(NoStackRestore);
2050returnOp.getOperand(0);
2051 }
2052
2053constMVT LocalVT =getPointerTy(DAG.getDataLayout(),ADDRESS_SPACE_LOCAL);
2054SDValue Chain =Op.getOperand(0);
2055SDValuePtr =Op.getOperand(1);
2056SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT,Ptr,ADDRESS_SPACE_GENERIC,
2057ADDRESS_SPACE_LOCAL);
2058return DAG.getNode(NVPTXISD::STACKRESTORE,DL, MVT::Other, {Chain, ASC});
2059}
2060
2061SDValueNVPTXTargetLowering::LowerSTACKSAVE(SDValueOp,
2062SelectionDAG &DAG) const{
2063SDLocDL(Op.getNode());
2064if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
2065constFunction &Fn = DAG.getMachineFunction().getFunction();
2066
2067DiagnosticInfoUnsupported NoStackSave(
2068 Fn,
2069"Support for stacksave requires PTX ISA version >= 7.3 and target >= "
2070"sm_52.",
2071DL.getDebugLoc());
2072 DAG.getContext()->diagnose(NoStackSave);
2073auto Ops = {DAG.getConstant(0,DL,Op.getValueType()),Op.getOperand(0)};
2074return DAG.getMergeValues(Ops,DL);
2075 }
2076
2077constMVT LocalVT =getPointerTy(DAG.getDataLayout(),ADDRESS_SPACE_LOCAL);
2078SDValue Chain =Op.getOperand(0);
2079SDValue SS =
2080 DAG.getNode(NVPTXISD::STACKSAVE,DL, {LocalVT, MVT::Other}, Chain);
2081SDValue ASC = DAG.getAddrSpaceCast(
2082DL,Op.getValueType(), SS,ADDRESS_SPACE_LOCAL,ADDRESS_SPACE_GENERIC);
2083return DAG.getMergeValues({ASC,SDValue(SS.getNode(), 1)},DL);
2084}
2085
2086// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
2087// (see LegalizeDAG.cpp). This is slow and uses local memory.
2088// We use extract/insert/build vector just as LegalizeOp() did in LLVM 2.5.
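// For example (illustrative), concat(<2 x f16> a, <2 x f16> b) becomes
// build_vector(a[0], a[1], b[0], b[1]).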
2089SDValue
2090NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValueOp,SelectionDAG &DAG) const{
2091SDNode *Node =Op.getNode();
2092SDLoc dl(Node);
2093SmallVector<SDValue, 8> Ops;
2094unsigned NumOperands = Node->getNumOperands();
2095for (unsigned i = 0; i < NumOperands; ++i) {
2096SDValue SubOp = Node->getOperand(i);
2097EVT VVT = SubOp.getNode()->getValueType(0);
2098EVT EltVT = VVT.getVectorElementType();
2099unsigned NumSubElem = VVT.getVectorNumElements();
2100for (unsigned j = 0; j < NumSubElem; ++j) {
2101 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
2102 DAG.getIntPtrConstant(j, dl)));
2103 }
2104 }
2105return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
2106}
2107
2108SDValue NVPTXTargetLowering::LowerBITCAST(SDValueOp,SelectionDAG &DAG) const{
2109// Handle bitcasting from v2i8 without hitting the default promotion
2110// strategy which goes through stack memory.
2111EVT FromVT =Op->getOperand(0)->getValueType(0);
2112if (FromVT != MVT::v2i8) {
2113returnOp;
2114 }
2115
2116// Pack vector elements into i16 and bitcast to final type
2117SDLocDL(Op);
2118SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,DL, MVT::i8,
2119Op->getOperand(0), DAG.getIntPtrConstant(0,DL));
2120SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,DL, MVT::i8,
2121Op->getOperand(0), DAG.getIntPtrConstant(1,DL));
2122SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND,DL, MVT::i16, Vec0);
2123SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND,DL, MVT::i16, Vec1);
2124SDValue Const8 = DAG.getConstant(8,DL, MVT::i16);
2125SDValue AsInt = DAG.getNode(
2126ISD::OR,DL, MVT::i16,
2127 {Extend0, DAG.getNode(ISD::SHL,DL, MVT::i16, {Extend1, Const8})});
2128EVT ToVT =Op->getValueType(0);
2129returnMaybeBitcast(DAG,DL, ToVT, AsInt);
2130}
2131
2132// We can init a constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
2133// would get lowered as two constant loads and a vector-packing move.
2134// Instead we want just a constant move:
2135// mov.b32 %r2, 0x40003C00
2136SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValueOp,
2137SelectionDAG &DAG) const{
2138EVT VT =Op->getValueType(0);
2139if (!(Isv2x16VT(VT) || VT == MVT::v4i8))
2140returnOp;
2141SDLocDL(Op);
2142
2143if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
2144 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2145 isa<ConstantFPSDNode>(Operand);
2146 })) {
2147if (VT != MVT::v4i8)
2148returnOp;
2149// Lower a non-const v4i8 vector as a byte-wise constructed i32, which
2150// allows us to optimize the calculation of the constant parts.
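// PRMT forms each result byte by picking one byte of the 8-byte pair {R, L}
// (L supplies bytes 0-3, R bytes 4-7) according to the corresponding selector
// nibble; see the PTX ISA 'prmt' instruction.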
2151auto GetPRMT = [&](constSDValueLeft,constSDValueRight,bool Cast,
2152uint64_t SelectionValue) ->SDValue {
2153SDValueL =Left;
2154SDValueR =Right;
2155if (Cast) {
2156L = DAG.getAnyExtOrTrunc(L,DL, MVT::i32);
2157R = DAG.getAnyExtOrTrunc(R,DL, MVT::i32);
2158 }
2159return DAG.getNode(
2160NVPTXISD::PRMT,DL, MVT::v4i8,
2161 {L,R, DAG.getConstant(SelectionValue,DL, MVT::i32),
2162 DAG.getConstant(NVPTX::PTXPrmtMode::NONE,DL, MVT::i32)});
2163 };
2164auto PRMT__10 = GetPRMT(Op->getOperand(0),Op->getOperand(1),true, 0x3340);
2165auto PRMT__32 = GetPRMT(Op->getOperand(2),Op->getOperand(3),true, 0x3340);
2166auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32,false, 0x5410);
2167return DAG.getNode(ISD::BITCAST,DL, VT, PRMT3210);
2168 }
2169
2170// Get the value of the Nth operand as an APInt(32). Undef values are treated as 0.
2171auto GetOperand = [](SDValueOp,intN) ->APInt {
2172constSDValue &Operand =Op->getOperand(N);
2173EVT VT =Op->getValueType(0);
2174if (Operand->isUndef())
2175returnAPInt(32, 0);
2176APIntValue;
2177if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2178Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2179elseif (VT == MVT::v2i16 || VT == MVT::v4i8)
2180Value = Operand->getAsAPIntVal();
2181else
2182llvm_unreachable("Unsupported type");
2183// i8 values are carried around as i16, so we need to zero out the upper
2184// bits so they do not get in the way of combining individual byte values.
2185if (VT == MVT::v4i8)
2186Value =Value.trunc(8);
2187returnValue.zext(32);
2188 };
2189APIntValue;
2190if (Isv2x16VT(VT)) {
2191Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(16);
2192 }elseif (VT == MVT::v4i8) {
2193Value = GetOperand(Op, 0) | GetOperand(Op, 1).shl(8) |
2194 GetOperand(Op, 2).shl(16) | GetOperand(Op, 3).shl(24);
2195 }else {
2196llvm_unreachable("Unsupported type");
2197 }
2198SDValueConst = DAG.getConstant(Value,DL, MVT::i32);
2199return DAG.getNode(ISD::BITCAST,DL,Op->getValueType(0), Const);
2200}
2201
2202SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValueOp,
2203SelectionDAG &DAG) const{
2204SDValueIndex =Op->getOperand(1);
2205SDValueVector =Op->getOperand(0);
2206SDLocDL(Op);
2207EVT VectorVT =Vector.getValueType();
2208
2209if (VectorVT == MVT::v4i8) {
2210SDValueBFE =
2211 DAG.getNode(NVPTXISD::BFE,DL, MVT::i32,
2212 {Vector,
2213 DAG.getNode(ISD::MUL,DL, MVT::i32,
2214 DAG.getZExtOrTrunc(Index,DL, MVT::i32),
2215 DAG.getConstant(8,DL, MVT::i32)),
2216 DAG.getConstant(8,DL, MVT::i32)});
2217return DAG.getAnyExtOrTrunc(BFE,DL,Op->getValueType(0));
2218 }
2219
2220// Constant index will be matched by tablegen.
2221if (isa<ConstantSDNode>(Index.getNode()))
2222returnOp;
2223
2224// Extract individual elements and select one of them.
2225assert(Isv2x16VT(VectorVT) &&"Unexpected vector type.");
2226EVT EltVT = VectorVT.getVectorElementType();
2227
2228SDLoc dl(Op.getNode());
2229SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,Vector,
2230 DAG.getIntPtrConstant(0, dl));
2231SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,Vector,
2232 DAG.getIntPtrConstant(1, dl));
2233return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2234ISD::CondCode::SETEQ);
2235}
2236
2237SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValueOp,
2238SelectionDAG &DAG) const{
2239SDValueVector =Op->getOperand(0);
2240EVT VectorVT =Vector.getValueType();
2241
2242if (VectorVT != MVT::v4i8)
2243returnOp;
2244SDLocDL(Op);
2245SDValueValue =Op->getOperand(1);
2246if (Value->isUndef())
2247returnVector;
2248
2249SDValueIndex =Op->getOperand(2);
2250
2251SDValueBFI =
2252 DAG.getNode(NVPTXISD::BFI,DL, MVT::i32,
2253 {DAG.getZExtOrTrunc(Value,DL, MVT::i32),Vector,
2254 DAG.getNode(ISD::MUL,DL, MVT::i32,
2255 DAG.getZExtOrTrunc(Index,DL, MVT::i32),
2256 DAG.getConstant(8,DL, MVT::i32)),
2257 DAG.getConstant(8,DL, MVT::i32)});
2258return DAG.getNode(ISD::BITCAST,DL,Op->getValueType(0), BFI);
2259}
2260
2261SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValueOp,
2262SelectionDAG &DAG) const{
2263SDValue V1 =Op.getOperand(0);
2264EVT VectorVT = V1.getValueType();
2265if (VectorVT != MVT::v4i8 ||Op.getValueType() != MVT::v4i8)
2266returnOp;
2267
2268// Lower shuffle to PRMT instruction.
2269constShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2270SDValueV2 =Op.getOperand(1);
2271uint32_t Selector = 0;
2272for (autoI :llvm::enumerate(SVN->getMask())) {
2273if (I.value() != -1)// -1 is a placeholder for undef.
2274 Selector |= (I.value() << (I.index() * 4));
2275 }
2276
2277SDLocDL(Op);
2278return DAG.getNode(NVPTXISD::PRMT,DL, MVT::v4i8, V1, V2,
2279 DAG.getConstant(Selector,DL, MVT::i32),
2280 DAG.getConstant(NVPTX::PTXPrmtMode::NONE,DL, MVT::i32));
2281}
2282/// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which
2283/// 1) return two i32 values and take a 2 x i32 value to shift plus a shift
2284/// amount, or
2285/// 2) return two i64 values and take a 2 x i64 value to shift plus a shift
2286/// amount.
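/// For example (illustrative): logically shifting {aHi, aLo} = {0x1, 0x80000000}
/// right by 1 yields {dHi, dLo} = {0x0, 0xC0000000}.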
2287SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValueOp,
2288SelectionDAG &DAG) const{
2289assert(Op.getNumOperands() == 3 &&"Not a double-shift!");
2290assert(Op.getOpcode() ==ISD::SRA_PARTS ||Op.getOpcode() ==ISD::SRL_PARTS);
2291
2292EVT VT =Op.getValueType();
2293unsigned VTBits = VT.getSizeInBits();
2294SDLoc dl(Op);
2295SDValue ShOpLo =Op.getOperand(0);
2296SDValue ShOpHi =Op.getOperand(1);
2297SDValue ShAmt =Op.getOperand(2);
2298unsigned Opc = (Op.getOpcode() ==ISD::SRA_PARTS) ?ISD::SRA :ISD::SRL;
2299
2300if (VTBits == 32 && STI.getSmVersion() >= 35) {
2301// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2302// {dHi, dLo} = {aHi, aLo} >> Amt
2303// dHi = aHi >> Amt
2304// dLo = shf.r.clamp aLo, aHi, Amt
2305
2306SDValueHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2307SDValueLo =
2308 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2309
2310SDValue Ops[2] = {Lo,Hi };
2311return DAG.getMergeValues(Ops, dl);
2312 }
2313else {
2314// {dHi, dLo} = {aHi, aLo} >> Amt
2315// - if (Amt>=size) then
2316// dLo = aHi >> (Amt-size)
2317// dHi = aHi >> Amt (this is either all 0 or all 1)
2318// else
2319// dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2320// dHi = aHi >> Amt
2321
2322SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2323 DAG.getConstant(VTBits, dl, MVT::i32),
2324 ShAmt);
2325SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2326SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2327 DAG.getConstant(VTBits, dl, MVT::i32));
2328SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2329SDValueFalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2330SDValueTrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2331
2332SDValueCmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2333 DAG.getConstant(VTBits, dl, MVT::i32),
2334ISD::SETGE);
2335SDValueHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2336SDValueLo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2337
2338SDValue Ops[2] = {Lo,Hi };
2339return DAG.getMergeValues(Ops, dl);
2340 }
2341}
2342
2343/// LowerShiftLeftParts - Lower SHL_PARTS, which
2344/// 1) returns two i32 values and takes a 2 x i32 value to shift plus a shift
2345/// amount, or
2346/// 2) returns two i64 values and takes a 2 x i64 value to shift plus a shift
2347/// amount.
2348SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValueOp,
2349SelectionDAG &DAG) const{
2350assert(Op.getNumOperands() == 3 &&"Not a double-shift!");
2351assert(Op.getOpcode() ==ISD::SHL_PARTS);
2352
2353EVT VT =Op.getValueType();
2354unsigned VTBits = VT.getSizeInBits();
2355SDLoc dl(Op);
2356SDValue ShOpLo =Op.getOperand(0);
2357SDValue ShOpHi =Op.getOperand(1);
2358SDValue ShAmt =Op.getOperand(2);
2359
2360if (VTBits == 32 && STI.getSmVersion() >= 35) {
2361// For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2362// {dHi, dLo} = {aHi, aLo} << Amt
2363// dHi = shf.l.clamp aLo, aHi, Amt
2364// dLo = aLo << Amt
2365
2366SDValueHi =
2367 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2368SDValueLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2369
2370SDValue Ops[2] = {Lo,Hi };
2371return DAG.getMergeValues(Ops, dl);
2372 }
2373else {
2374// {dHi, dLo} = {aHi, aLo} << Amt
2375// - if (Amt>=size) then
2376// dLo = aLo << Amt (all 0)
2377// dHi = aLo << (Amt-size)
2378// else
2379// dLo = aLo << Amt
2380// dHi = (aHi << Amt) | (aLo >> (size-Amt))
2381
2382SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2383 DAG.getConstant(VTBits, dl, MVT::i32),
2384 ShAmt);
2385SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2386SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2387 DAG.getConstant(VTBits, dl, MVT::i32));
2388SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2389SDValueFalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2390SDValueTrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2391
2392SDValueCmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2393 DAG.getConstant(VTBits, dl, MVT::i32),
2394ISD::SETGE);
2395SDValueLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2396SDValueHi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2397
2398SDValue Ops[2] = {Lo,Hi };
2399return DAG.getMergeValues(Ops, dl);
2400 }
2401}
2402
2403/// If the types match, convert the generic copysign to the NVPTXISD version;
2404/// otherwise bail, ensuring that mismatched cases are properly expanded.
2405SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValueOp,
2406SelectionDAG &DAG) const{
2407EVT VT =Op.getValueType();
2408SDLocDL(Op);
2409
2410SDValue In1 =Op.getOperand(0);
2411SDValue In2 =Op.getOperand(1);
2412EVT SrcVT = In2.getValueType();
2413
2414if (!SrcVT.bitsEq(VT))
2415returnSDValue();
2416
2417return DAG.getNode(NVPTXISD::FCOPYSIGN,DL, VT, In1, In2);
2418}
2419
2420SDValue NVPTXTargetLowering::LowerFROUND(SDValueOp,SelectionDAG &DAG) const{
2421EVT VT =Op.getValueType();
2422
2423if (VT == MVT::f32)
2424return LowerFROUND32(Op, DAG);
2425
2426if (VT == MVT::f64)
2427return LowerFROUND64(Op, DAG);
2428
2429llvm_unreachable("unhandled type");
2430}
2431
2432// This is the rounding method used in CUDA libdevice, in C-like code:
2433// float roundf(float A)
2434// {
2435// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2436// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2437// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2438// }
2439SDValue NVPTXTargetLowering::LowerFROUND32(SDValueOp,
2440SelectionDAG &DAG) const{
2441SDLoc SL(Op);
2442SDValueA =Op.getOperand(0);
2443EVT VT =Op.getValueType();
2444
2445SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT,A);
2446
2447// RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2448SDValueBitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32,A);
2449constunsigned SignBitMask = 0x80000000;
2450SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2451 DAG.getConstant(SignBitMask, SL, MVT::i32));
2452constunsigned PointFiveInBits = 0x3F000000;
2453SDValue PointFiveWithSignRaw =
2454 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2455 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2456SDValue PointFiveWithSign =
2457 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2458SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT,A, PointFiveWithSign);
2459SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2460
2461// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2462EVT SetCCVT =getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2463SDValue IsLarge =
2464 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2465ISD::SETOGT);
2466 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge,A, RoundedA);
2467
2468// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2469SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2470 DAG.getConstantFP(0.5, SL, VT),ISD::SETOLT);
2471SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT,A);
2472return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2473}
2474
2475// The implementation of round(double) is similar to that of round(float) in
2476// that they both separate the value range into three regions and use a method
2477// specific to each region to round the values. However, round(double) first
2478// rounds the absolute value and then restores the sign, while round(float)
2479// rounds the value with its sign directly.
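//
// In C-like pseudocode, a sketch of the lowering below (not libdevice source):
// double round(double A)
// {
//   double RoundedA = trunc(fabs(A) + 0.5);
//   RoundedA = fabs(A) < 0.5 ? 0.0 : RoundedA;
//   RoundedA = copysign(RoundedA, A);
//   return fabs(A) > 0x1.0p52 ? A : RoundedA;
// }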
2480SDValue NVPTXTargetLowering::LowerFROUND64(SDValueOp,
2481SelectionDAG &DAG) const{
2482SDLoc SL(Op);
2483SDValueA =Op.getOperand(0);
2484EVT VT =Op.getValueType();
2485
2486SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT,A);
2487
2488// double RoundedA = (double) (int) (abs(A) + 0.5f);
2489SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2490 DAG.getConstantFP(0.5, SL, VT));
2491SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2492
2493// RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2494EVT SetCCVT =getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2495SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2496 DAG.getConstantFP(0.5, SL, VT),ISD::SETOLT);
2497 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2498 DAG.getConstantFP(0, SL, VT),
2499 RoundedA);
2500
2501// Add sign to rounded_A
2502 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA,A);
2503
2504
2505// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2506SDValue IsLarge =
2507 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2508ISD::SETOGT);
2509return DAG.getNode(ISD::SELECT, SL, VT, IsLarge,A, RoundedA);
2510}
2511
2512staticSDValuePromoteBinOpToF32(SDNode *N,SelectionDAG &DAG) {
2513EVT VT =N->getValueType(0);
2514EVT NVT = MVT::f32;
2515if (VT.isVector()) {
2516 NVT =EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2517 }
2518SDLocDL(N);
2519SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0),DL, NVT);
2520SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1),DL, NVT);
2521SDValue Res = DAG.getNode(N->getOpcode(),DL, NVT, Tmp0, Tmp1,N->getFlags());
2522return DAG.getFPExtendOrRound(Res,DL, VT);
2523}
2524
2525SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValueOp,
2526SelectionDAG &DAG) const{
2527if (useF32FTZ(DAG.getMachineFunction())) {
2528returnPromoteBinOpToF32(Op.getNode(), DAG);
2529 }
2530returnOp;
2531}
2532
2533SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValueOp,
2534SelectionDAG &DAG) const{
2535assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2536
2537if (Op.getValueType() == MVT::bf16) {
2538SDLoc Loc(Op);
2539return DAG.getNode(
2540ISD::FP_ROUND, Loc, MVT::bf16,
2541 DAG.getNode(Op.getOpcode(), Loc, MVT::f32,Op.getOperand(0)),
2542 DAG.getIntPtrConstant(0, Loc,/*isTarget=*/true));
2543 }
2544
2545// Everything else is considered legal.
2546returnOp;
2547}
2548
2549SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValueOp,
2550SelectionDAG &DAG) const{
2551assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2552
2553if (Op.getOperand(0).getValueType() == MVT::bf16) {
2554SDLoc Loc(Op);
2555return DAG.getNode(
2556Op.getOpcode(), Loc,Op.getValueType(),
2557 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32,Op.getOperand(0)));
2558 }
2559
2560// Everything else is considered legal.
2561returnOp;
2562}
2563
2564SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValueOp,
2565SelectionDAG &DAG) const{
2566EVT NarrowVT =Op.getValueType();
2567SDValue Wide =Op.getOperand(0);
2568EVT WideVT = Wide.getValueType();
2569if (NarrowVT.getScalarType() == MVT::bf16) {
2570constTargetLowering *TLI = STI.getTargetLowering();
2571if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2572return TLI->expandFP_ROUND(Op.getNode(), DAG);
2573 }
2574if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2575// This combination was the first to support f32 -> bf16.
2576if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2577if (WideVT.getScalarType() == MVT::f32) {
2578returnOp;
2579 }
2580if (WideVT.getScalarType() == MVT::f64) {
2581SDLoc Loc(Op);
2582// Round-inexact-to-odd f64 to f32, then do the final rounding using
2583// the hardware f32 -> bf16 instruction.
2584SDValue rod = TLI->expandRoundInexactToOdd(
2585 WideVT.isVector() ? WideVT.changeVectorElementType(MVT::f32)
2586 : MVT::f32,
2587 Wide, Loc, DAG);
2588return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2589 }
2590 }
2591return TLI->expandFP_ROUND(Op.getNode(), DAG);
2592 }
2593 }
2594
2595// Everything else is considered legal.
2596returnOp;
2597}
2598
2599SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValueOp,
2600SelectionDAG &DAG) const{
2601SDValue Narrow =Op.getOperand(0);
2602EVT NarrowVT = Narrow.getValueType();
2603EVT WideVT =Op.getValueType();
2604if (NarrowVT.getScalarType() == MVT::bf16) {
2605if (WideVT.getScalarType() == MVT::f32 &&
2606 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2607SDLoc Loc(Op);
2608return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2609 }
2610if (WideVT.getScalarType() == MVT::f64 &&
2611 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2612EVTF32 = NarrowVT.isVector() ? NarrowVT.changeVectorElementType(MVT::f32)
2613 : MVT::f32;
2614SDLoc Loc(Op);
2615if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2616Op = DAG.getNode(ISD::FP_EXTEND, Loc,F32, Narrow);
2617 }else {
2618Op = DAG.getNode(ISD::BF16_TO_FP, Loc,F32, Narrow);
2619 }
2620return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT,Op);
2621 }
2622 }
2623
2624// Everything else is considered legal.
2625returnOp;
2626}
2627
2628staticSDValueLowerVectorArith(SDValueOp,SelectionDAG &DAG) {
2629SDLocDL(Op);
2630if (Op.getValueType() != MVT::v2i16)
2631returnOp;
2632EVT EltVT =Op.getValueType().getVectorElementType();
2633SmallVector<SDValue> VecElements;
2634for (intI = 0, E =Op.getValueType().getVectorNumElements();I < E;I++) {
2635SmallVector<SDValue> ScalarArgs;
2636llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2637 [&](constSDUse &O) {
2638 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2639 O.get(), DAG.getIntPtrConstant(I, DL));
2640 });
2641 VecElements.push_back(DAG.getNode(Op.getOpcode(),DL, EltVT, ScalarArgs));
2642 }
2643SDValue V =
2644 DAG.getNode(ISD::BUILD_VECTOR,DL,Op.getValueType(), VecElements);
2645return V;
2646}
2647
2648SDValue
2649NVPTXTargetLowering::LowerOperation(SDValueOp,SelectionDAG &DAG) const{
2650switch (Op.getOpcode()) {
2651caseISD::RETURNADDR:
2652returnSDValue();
2653caseISD::FRAMEADDR:
2654returnSDValue();
2655caseISD::GlobalAddress:
2656returnLowerGlobalAddress(Op, DAG);
2657caseISD::INTRINSIC_W_CHAIN:
2658returnOp;
2659caseISD::BUILD_VECTOR:
2660return LowerBUILD_VECTOR(Op, DAG);
2661caseISD::BITCAST:
2662return LowerBITCAST(Op, DAG);
2663caseISD::EXTRACT_SUBVECTOR:
2664returnOp;
2665caseISD::EXTRACT_VECTOR_ELT:
2666return LowerEXTRACT_VECTOR_ELT(Op, DAG);
2667caseISD::INSERT_VECTOR_ELT:
2668return LowerINSERT_VECTOR_ELT(Op, DAG);
2669caseISD::VECTOR_SHUFFLE:
2670return LowerVECTOR_SHUFFLE(Op, DAG);
2671caseISD::CONCAT_VECTORS:
2672return LowerCONCAT_VECTORS(Op, DAG);
2673caseISD::STORE:
2674return LowerSTORE(Op, DAG);
2675caseISD::LOAD:
2676return LowerLOAD(Op, DAG);
2677caseISD::SHL_PARTS:
2678return LowerShiftLeftParts(Op, DAG);
2679caseISD::SRA_PARTS:
2680caseISD::SRL_PARTS:
2681return LowerShiftRightParts(Op, DAG);
2682caseISD::SELECT:
2683return LowerSelect(Op, DAG);
2684caseISD::FROUND:
2685return LowerFROUND(Op, DAG);
2686caseISD::FCOPYSIGN:
2687return LowerFCOPYSIGN(Op, DAG);
2688caseISD::SINT_TO_FP:
2689caseISD::UINT_TO_FP:
2690return LowerINT_TO_FP(Op, DAG);
2691caseISD::FP_TO_SINT:
2692caseISD::FP_TO_UINT:
2693return LowerFP_TO_INT(Op, DAG);
2694caseISD::FP_ROUND:
2695return LowerFP_ROUND(Op, DAG);
2696caseISD::FP_EXTEND:
2697return LowerFP_EXTEND(Op, DAG);
2698caseISD::BR_JT:
2699return LowerBR_JT(Op, DAG);
2700caseISD::VAARG:
2701return LowerVAARG(Op, DAG);
2702caseISD::VASTART:
2703return LowerVASTART(Op, DAG);
2704caseISD::ABS:
2705caseISD::SMIN:
2706caseISD::SMAX:
2707caseISD::UMIN:
2708caseISD::UMAX:
2709caseISD::ADD:
2710caseISD::SUB:
2711caseISD::MUL:
2712caseISD::SHL:
2713caseISD::SREM:
2714caseISD::UREM:
2715returnLowerVectorArith(Op, DAG);
2716caseISD::DYNAMIC_STACKALLOC:
2717returnLowerDYNAMIC_STACKALLOC(Op, DAG);
2718caseISD::STACKRESTORE:
2719returnLowerSTACKRESTORE(Op, DAG);
2720caseISD::STACKSAVE:
2721returnLowerSTACKSAVE(Op, DAG);
2722caseISD::CopyToReg:
2723return LowerCopyToReg_128(Op, DAG);
2724caseISD::FADD:
2725caseISD::FSUB:
2726caseISD::FMUL:
2727// Used only for bf16 on SM80, where we select fma for non-ftz operations.
2728return PromoteBinOpIfF32FTZ(Op, DAG);
2729
2730default:
2731llvm_unreachable("Custom lowering not defined for operation");
2732 }
2733}
2734
2735SDValue NVPTXTargetLowering::LowerBR_JT(SDValueOp,SelectionDAG &DAG) const{
2736SDLocDL(Op);
2737SDValue Chain =Op.getOperand(0);
2738constauto *JT = cast<JumpTableSDNode>(Op.getOperand(1));
2739SDValue Index =Op.getOperand(2);
2740
2741unsigned JId = JT->getIndex();
2742MachineJumpTableInfo *MJTI = DAG.getMachineFunction().getJumpTableInfo();
2743ArrayRef<MachineBasicBlock *> MBBs = MJTI->getJumpTables()[JId].MBBs;
2744
2745SDValue IdV = DAG.getConstant(JId,DL, MVT::i32);
2746
2747// Generate BrxStart node
2748SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2749 Chain = DAG.getNode(NVPTXISD::BrxStart,DL, VTs, Chain, IdV);
2750
2751// Generate BrxItem nodes
2752assert(!MBBs.empty());
2753for (MachineBasicBlock *MBB : MBBs.drop_back())
2754 Chain = DAG.getNode(NVPTXISD::BrxItem,DL, VTs, Chain.getValue(0),
2755 DAG.getBasicBlock(MBB), Chain.getValue(1));
2756
2757// Generate BrxEnd nodes
2758SDValue EndOps[] = {Chain.getValue(0), DAG.getBasicBlock(MBBs.back()), Index,
2759 IdV, Chain.getValue(1)};
2760SDValue BrxEnd = DAG.getNode(NVPTXISD::BrxEnd,DL, VTs, EndOps);
2761
2762return BrxEnd;
2763}
2764
2765// This will prevent AsmPrinter from trying to print the jump tables itself.
2766unsignedNVPTXTargetLowering::getJumpTableEncoding() const{
2767returnMachineJumpTableInfo::EK_Inline;
2768}
2769
2770// This function is almost a copy of SelectionDAG::expandVAArg().
2771// The only difference is that this one produces loads from the local address space.
2772SDValue NVPTXTargetLowering::LowerVAARG(SDValueOp,SelectionDAG &DAG) const{
2773constTargetLowering *TLI = STI.getTargetLowering();
2774SDLocDL(Op);
2775
2776SDNode *Node =Op.getNode();
2777constValue *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2778EVT VT = Node->getValueType(0);
2779auto *Ty = VT.getTypeForEVT(*DAG.getContext());
2780SDValue Tmp1 = Node->getOperand(0);
2781SDValue Tmp2 = Node->getOperand(1);
2782constMaybeAlign MA(Node->getConstantOperandVal(3));
2783
2784SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()),DL,
2785 Tmp1, Tmp2,MachinePointerInfo(V));
2786SDValue VAList = VAListLoad;
2787
2788if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
2789 VAList = DAG.getNode(
2790ISD::ADD,DL, VAList.getValueType(), VAList,
2791 DAG.getConstant(MA->value() - 1,DL, VAList.getValueType()));
2792
2793 VAList = DAG.getNode(ISD::AND,DL, VAList.getValueType(), VAList,
2794 DAG.getSignedConstant(-(int64_t)MA->value(),DL,
2795 VAList.getValueType()));
2796 }
2797
2798// Increment the pointer, VAList, to the next vaarg
2799 Tmp1 = DAG.getNode(ISD::ADD,DL, VAList.getValueType(), VAList,
2800 DAG.getConstant(DAG.getDataLayout().getTypeAllocSize(Ty),
2801DL, VAList.getValueType()));
2802
2803// Store the incremented VAList to the legalized pointer
2804 Tmp1 = DAG.getStore(VAListLoad.getValue(1),DL, Tmp1, Tmp2,
2805MachinePointerInfo(V));
2806
2807constValue *SrcV =Constant::getNullValue(
2808PointerType::get(*DAG.getContext(),ADDRESS_SPACE_LOCAL));
2809
2810// Load the actual argument out of the pointer VAList
2811return DAG.getLoad(VT,DL, Tmp1, VAList,MachinePointerInfo(SrcV));
2812}
2813
2814SDValue NVPTXTargetLowering::LowerVASTART(SDValueOp,SelectionDAG &DAG) const{
2815constTargetLowering *TLI = STI.getTargetLowering();
2816SDLocDL(Op);
2817EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
2818
2819// Store the address of the unsized array <function>_vararg[] in the ap object.
2820SDValue Arg = getParamSymbol(DAG,/* vararg */ -1, PtrVT);
2821SDValue VAReg = DAG.getNode(NVPTXISD::Wrapper,DL, PtrVT, Arg);
2822
2823constValue *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
2824return DAG.getStore(Op.getOperand(0),DL, VAReg,Op.getOperand(1),
2825MachinePointerInfo(SV));
2826}
2827
2828SDValue NVPTXTargetLowering::LowerSelect(SDValueOp,SelectionDAG &DAG) const{
2829SDValue Op0 =Op->getOperand(0);
2830SDValue Op1 =Op->getOperand(1);
2831SDValue Op2 =Op->getOperand(2);
2832SDLocDL(Op.getNode());
2833
2834assert(Op.getValueType() == MVT::i1 &&"Custom lowering enabled only for i1");
2835
2836 Op1 = DAG.getNode(ISD::ANY_EXTEND,DL, MVT::i32, Op1);
2837 Op2 = DAG.getNode(ISD::ANY_EXTEND,DL, MVT::i32, Op2);
2838SDValueSelect = DAG.getNode(ISD::SELECT,DL, MVT::i32, Op0, Op1, Op2);
2839SDValue Trunc = DAG.getNode(ISD::TRUNCATE,DL, MVT::i1,Select);
2840
2841return Trunc;
2842}
2843
2844SDValue NVPTXTargetLowering::LowerLOAD(SDValueOp,SelectionDAG &DAG) const{
2845if (Op.getValueType() == MVT::i1)
2846return LowerLOADi1(Op, DAG);
2847
2848// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2849// handle unaligned loads and have to handle them here.
2850EVT VT =Op.getValueType();
2851if (Isv2x16VT(VT) || VT == MVT::v4i8) {
2852LoadSDNode *Load = cast<LoadSDNode>(Op);
2853EVT MemVT =Load->getMemoryVT();
2854if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2855 MemVT, *Load->getMemOperand())) {
2856SDValue Ops[2];
2857 std::tie(Ops[0], Ops[1]) =expandUnalignedLoad(Load, DAG);
2858return DAG.getMergeValues(Ops,SDLoc(Op));
2859 }
2860 }
2861
2862returnSDValue();
2863}
2864
2865// v = ld i1* addr
2866// =>
2867// v1 = ld i8* addr (-> i16)
2868// v = trunc i16 to i1
2869SDValue NVPTXTargetLowering::LowerLOADi1(SDValueOp,SelectionDAG &DAG) const{
2870SDNode *Node =Op.getNode();
2871LoadSDNode *LD = cast<LoadSDNode>(Node);
2872SDLoc dl(Node);
2873assert(LD->getExtensionType() ==ISD::NON_EXTLOAD);
2874assert(Node->getValueType(0) == MVT::i1 &&
2875"Custom lowering for i1 load only");
2876SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16,LD->getChain(),
2877LD->getBasePtr(),LD->getPointerInfo(),
2878 MVT::i8,LD->getAlign(),
2879LD->getMemOperand()->getFlags());
2880SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
2881// The legalizer (the caller) is expecting two values from the legalized
2882// load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
2883// in LegalizeDAG.cpp which also uses MergeValues.
2884SDValue Ops[] = { result,LD->getChain() };
2885return DAG.getMergeValues(Ops, dl);
2886}
2887
2888SDValue NVPTXTargetLowering::LowerSTORE(SDValueOp,SelectionDAG &DAG) const{
2889StoreSDNode *Store = cast<StoreSDNode>(Op);
2890EVT VT =Store->getMemoryVT();
2891
2892if (VT == MVT::i1)
2893return LowerSTOREi1(Op, DAG);
2894
2895// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on the legalizer to
2896// handle unaligned stores and have to handle them here.
2897if ((Isv2x16VT(VT) || VT == MVT::v4i8) &&
2898 !allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
2899 VT, *Store->getMemOperand()))
2900returnexpandUnalignedStore(Store, DAG);
2901
2902// v2f16, v2bf16, v2i16 and v4i8 don't need further special handling.
2903if (Isv2x16VT(VT) || VT == MVT::v4i8)
2904returnSDValue();
2905
2906if (VT.isVector())
2907return LowerSTOREVector(Op, DAG);
2908
2909returnSDValue();
2910}
2911
2912SDValue
2913NVPTXTargetLowering::LowerSTOREVector(SDValueOp,SelectionDAG &DAG) const{
2914SDNode *N =Op.getNode();
2915SDValue Val =N->getOperand(1);
2916SDLocDL(N);
2917EVT ValVT = Val.getValueType();
2918
2919auto NumEltsAndEltVT =getVectorLoweringShape(ValVT);
2920if (!NumEltsAndEltVT)
2921returnSDValue();
2922auto [NumElts, EltVT] = NumEltsAndEltVT.value();
2923
2924MemSDNode *MemSD = cast<MemSDNode>(N);
2925constDataLayout &TD = DAG.getDataLayout();
2926
2927Align Alignment = MemSD->getAlign();
2928Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
2929if (Alignment < PrefAlign) {
2930// This store is not sufficiently aligned, so bail out and let this vector
2931// store be scalarized. Note that we may still be able to emit smaller
2932// vector stores. For example, if we are storing a <4 x float> with an
2933// alignment of 8, this check will fail but the legalizer will try again
2934// with 2 x <2 x float>, which will succeed with an alignment of 8.
2935returnSDValue();
2936 }
2937
2938// Since StoreV2 is a target node, we cannot rely on DAG type legalization.
2939// Therefore, we must ensure the type is legal. For i1 and i8, we set the
2940// stored type to i16 and propagate the "real" type as the memory type.
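// For example (illustrative), i8 elements are any-extended to i16 operands
// below, while the memory VT keeps the original element type so only one
// byte per element is actually stored.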
2941bool NeedExt =false;
2942if (EltVT.getSizeInBits() < 16)
2943 NeedExt =true;
2944
2945unsigned Opcode = 0;
2946switch (NumElts) {
2947default:
2948returnSDValue();
2949case 2:
2950 Opcode =NVPTXISD::StoreV2;
2951break;
2952case 4:
2953 Opcode =NVPTXISD::StoreV4;
2954break;
2955 }
2956
2957SmallVector<SDValue, 8> Ops;
2958
2959// First is the chain
2960 Ops.push_back(N->getOperand(0));
2961
2962// Then the split values
2963assert(NumElts <= ValVT.getVectorNumElements() &&
2964"NumElts should not increase, only decrease or stay the same.");
2965if (NumElts < ValVT.getVectorNumElements()) {
2966// If the number of elements has decreased, getVectorLoweringShape has
2967// upsized the element types
2968assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
2969 EltVT.getVectorNumElements() <= 4 &&"Unexpected upsized type.");
2970// Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
2971// stored as b32s
2972unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
2973for (unsigned i = 0; i < NumElts; ++i) {
2974SmallVector<SDValue, 4> SubVectorElts;
2975 DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
2976 NumEltsPerSubVector);
2977SDValue SubVector = DAG.getBuildVector(EltVT,DL, SubVectorElts);
2978 Ops.push_back(SubVector);
2979 }
2980 }else {
2981for (unsigned i = 0; i < NumElts; ++i) {
2982SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT,DL, EltVT, Val,
2983 DAG.getIntPtrConstant(i,DL));
2984if (NeedExt)
2985 ExtVal = DAG.getNode(ISD::ANY_EXTEND,DL, MVT::i16, ExtVal);
2986 Ops.push_back(ExtVal);
2987 }
2988 }
2989
2990// Then any remaining arguments
2991 Ops.append(N->op_begin() + 2,N->op_end());
2992
2993SDValue NewSt =
2994 DAG.getMemIntrinsicNode(Opcode,DL, DAG.getVTList(MVT::Other), Ops,
2995 MemSD->getMemoryVT(), MemSD->getMemOperand());
2996
2997// return DCI.CombineTo(N, NewSt, true);
2998return NewSt;
2999}
3000
3001// st i1 v, addr
3002// =>
3003// v1 = zxt v to i16
3004// st.u8 i16, addr
3005SDValue NVPTXTargetLowering::LowerSTOREi1(SDValueOp,SelectionDAG &DAG) const{
3006SDNode *Node =Op.getNode();
3007SDLoc dl(Node);
3008StoreSDNode *ST = cast<StoreSDNode>(Node);
3009SDValue Tmp1 =ST->getChain();
3010SDValue Tmp2 =ST->getBasePtr();
3011SDValue Tmp3 =ST->getValue();
3012assert(Tmp3.getValueType() == MVT::i1 &&"Custom lowering for i1 store only");
3013 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3014SDValueResult =
3015 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2,ST->getPointerInfo(), MVT::i8,
3016ST->getAlign(),ST->getMemOperand()->getFlags());
3017returnResult;
3018}

SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
                                                SelectionDAG &DAG) const {
  // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
  // operand so that it can pass the legalization.

  assert(Op.getOperand(1).getValueType() == MVT::i128 &&
         "Custom lowering for 128-bit CopyToReg only");

  SDNode *Node = Op.getNode();
  SDLoc DL(Node);

  SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
                           DAG.getIntPtrConstant(0, DL));
  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
                           DAG.getIntPtrConstant(1, DL));

  SmallVector<SDValue, 5> NewOps(Op->getNumOperands() + 1);
  SmallVector<EVT, 3> ResultsType(Node->values());

  NewOps[0] = Op->getOperand(0); // Chain
  NewOps[1] = Op->getOperand(1); // Dst Reg
  NewOps[2] = Lo;                // Lower 64-bit
  NewOps[3] = Hi;                // Higher 64-bit
  if (Op.getNumOperands() == 4)
    NewOps[4] = Op->getOperand(3); // Glue if exists

  return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
}
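
// Illustratively, CopyToReg(Chain, Reg, i128 %val [, Glue]) is rewritten
// above as CopyToReg(Chain, Reg, i64 Lo, i64 Hi [, Glue]), where Lo/Hi are
// the two halves extracted from the v2i64 bitcast of %val.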

unsigned NVPTXTargetLowering::getNumRegisters(
    LLVMContext &Context, EVT VT,
    std::optional<MVT> RegisterVT = std::nullopt) const {
  if (VT == MVT::i128 && RegisterVT == MVT::i128)
    return 1;
  return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
}

bool NVPTXTargetLowering::splitValueIntoRegisterParts(
    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
    unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
  if (Val.getValueType() == MVT::i128 && NumParts == 1) {
    Parts[0] = Val;
    return true;
  }
  return false;
}
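
// Together with getNumRegisters above, this keeps an i128 value as a single
// 128-bit part rather than letting the common code split it into two i64
// parts (useful, e.g., for the 128-bit inline-asm 'q' constraint handled
// later in this file).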

// This creates a target external symbol for a function parameter.
// The name of the symbol is composed from its index and the function name.
// A negative index corresponds to the special parameter (unsized array) used
// for passing variable arguments.
SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int idx,
                                            EVT v) const {
  StringRef SavedStr = nvTM->getStrPool().save(
      getParamName(&DAG.getMachineFunction().getFunction(), idx));
  return DAG.getTargetExternalSymbol(SavedStr.data(), v);
}
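
// For instance, for a function named "foo", parameter 0 is named
// "foo_param_0" and the vararg parameter "foo_vararg" (see getParamName
// below for the exact construction).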

SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
  MachineFunction &MF = DAG.getMachineFunction();
  const DataLayout &DL = DAG.getDataLayout();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  const Function *F = &MF.getFunction();
  const AttributeList &PAL = F->getAttributes();
  const TargetLowering *TLI = STI.getTargetLowering();

  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  std::vector<Type *> argTypes;
  std::vector<const Argument *> theArgs;
  for (const Argument &I : F->args()) {
    theArgs.push_back(&I);
    argTypes.push_back(I.getType());
  }
  // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
  // Ins.size() will be larger
  //   * if there is an aggregate argument with multiple fields (each field
  //     showing up separately in Ins)
  //   * if there is a vector argument with more than typical vector-length
  //     elements (generally if more than 4) where each vector element is
  //     individually present in Ins.
  // So a different index should be used for indexing into Ins.
  // See similar issue in LowerCall.
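  // For example, an argument of type {i32, float} shows up as a single entry
  // in theArgs/argTypes but contributes two entries (one per field) to Ins.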
  unsigned InsIdx = 0;

  for (unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
    Type *Ty = argTypes[i];

    if (theArgs[i]->use_empty()) {
      // argument is dead
      if (IsTypePassedAsArray(Ty) && !Ty->isVectorTy()) {
        SmallVector<EVT, 16> vtparts;

        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts);
        if (vtparts.empty())
          report_fatal_error("Empty parameter types are not supported");

        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
             ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (vtparts.size() > 0)
          --InsIdx;
        continue;
      }
      if (Ty->isVectorTy()) {
        EVT ObjectVT = getValueType(DL, Ty);
        unsigned NumRegs = TLI->getNumRegisters(F->getContext(), ObjectVT);
        for (unsigned parti = 0; parti < NumRegs; ++parti) {
          InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
          ++InsIdx;
        }
        if (NumRegs > 0)
          --InsIdx;
        continue;
      }
      InVals.push_back(DAG.getNode(ISD::UNDEF, dl, Ins[InsIdx].VT));
      continue;
    }

    // In the following cases, assign a node order of "i+1"
    // to newly created nodes. The SDNodes for params have to
    // appear in the same order as their order of appearance
    // in the original function. "i+1" holds that order.
    if (!PAL.hasParamAttr(i, Attribute::ByVal)) {
      bool aggregateIsPacked = false;
      if (StructType *STy = dyn_cast<StructType>(Ty))
        aggregateIsPacked = STy->isPacked();

      SmallVector<EVT, 16> VTs;
      SmallVector<uint64_t, 16> Offsets;
      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
      if (VTs.empty())
        report_fatal_error("Empty parameter types are not supported");

      Align ArgAlign = getFunctionArgumentAlignment(
          F, Ty, i + AttributeList::FirstArgIndex, DL);
      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);

      SDValue Arg = getParamSymbol(DAG, i, PtrVT);
      int VecIdx = -1; // Index of the first element of the current vector.
      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
        if (VectorInfo[parti] & PVF_FIRST) {
          assert(VecIdx == -1 && "Orphaned vector.");
          VecIdx = parti;
        }

        // That's the last element of this store op.
        if (VectorInfo[parti] & PVF_LAST) {
          unsigned NumElts = parti - VecIdx + 1;
          EVT EltVT = VTs[parti];
          // i1 is loaded/stored as i8.
          EVT LoadVT = EltVT;
          if (EltVT == MVT::i1)
            LoadVT = MVT::i8;
          else if (Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
            // getLoad needs a vector type, but it can't handle
            // vectors which contain v2f16 or v2bf16 elements. So we must load
            // using i32 here and then bitcast back.
            LoadVT = MVT::i32;

          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
          SDValue VecAddr =
              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
          Value *srcValue = Constant::getNullValue(
              PointerType::get(F->getContext(), ADDRESS_SPACE_PARAM));

          const MaybeAlign PartAlign = [&]() -> MaybeAlign {
            if (aggregateIsPacked)
              return Align(1);
            if (NumElts != 1)
              return std::nullopt;
            Align PartAlign =
                DL.getABITypeAlign(EltVT.getTypeForEVT(F->getContext()));
            return commonAlignment(PartAlign, Offsets[parti]);
          }();
          SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
                                  MachinePointerInfo(srcValue), PartAlign,
                                  MachineMemOperand::MODereferenceable |
                                      MachineMemOperand::MOInvariant);
          if (P.getNode())
            P.getNode()->setIROrder(i + 1);
          for (unsigned j = 0; j < NumElts; ++j) {
            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
                                      DAG.getIntPtrConstant(j, dl));
            // We've loaded i1 as an i8 and now must truncate it back to i1
            if (EltVT == MVT::i1)
              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
            // v2f16 was loaded as an i32. Now we must bitcast it back.
            else if (EltVT != LoadVT)
              Elt = DAG.getNode(ISD::BITCAST, dl, EltVT, Elt);

            // If a promoted integer type is used, truncate down to the
            // original
            MVT PromotedVT;
            if (PromoteScalarIntegerPTX(EltVT, &PromotedVT)) {
              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
            }

            // Extend the element if necessary (e.g. an i8 is loaded
            // into an i16 register)
            if (Ins[InsIdx].VT.isInteger() &&
                Ins[InsIdx].VT.getFixedSizeInBits() >
                    LoadVT.getFixedSizeInBits()) {
              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                           : ISD::ZERO_EXTEND;
              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
            }
            InVals.push_back(Elt);
          }

          // Reset vector tracking state.
          VecIdx = -1;
        }
        ++InsIdx;
      }
      if (VTs.size() > 0)
        --InsIdx;
      continue;
    }

    // Param has ByVal attribute
    // Return MoveParam(param symbol).
    // Ideally, the param symbol can be returned directly,
    // but when SDNode builder decides to use it in a CopyToReg(),
    // machine instruction fails because TargetExternalSymbol
    // (not lowered) is target dependent, and CopyToReg assumes
    // the source is lowered.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
    SDValue Arg = getParamSymbol(DAG, i, PtrVT);
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(i + 1);
    InVals.push_back(p);
  }

  if (!OutChains.empty())
    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));

  return Chain;
}

// Use byte-stores when the param address of the return value is unaligned.
// This may happen when the return value is a field of a packed structure.
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain,
                                      uint64_t Offset, EVT ElementType,
                                      SDValue RetVal, const SDLoc &dl) {
  // Bit logic only works on integer types
  if (adjustElementType(ElementType))
    RetVal = DAG.getNode(ISD::BITCAST, dl, ElementType, RetVal);

  // Store each byte
  for (unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    SDValue ShiftVal = DAG.getNode(ISD::SRL, dl, ElementType, RetVal,
                                   DAG.getConstant(i * 8, dl, MVT::i32));
    SDValue StoreOperands[] = {Chain, DAG.getConstant(Offset + i, dl, MVT::i32),
                               ShiftVal};
    // Trunc-store only the last byte by using
    //   st.param.b8
    // The register type can be larger than b8.
    Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
                                    DAG.getVTList(MVT::Other), StoreOperands,
                                    MVT::i8, MachinePointerInfo(), std::nullopt,
                                    MachineMemOperand::MOStore);
  }
  return Chain;
}
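
// For example, an i32 return-value field at byte offset 1 of a packed struct
// is written with four one-byte st.param.b8 stores at offsets 1..4, each
// storing the value shifted right by 8*i bits.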

SDValue
NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                 bool isVarArg,
                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
                                 const SmallVectorImpl<SDValue> &OutVals,
                                 const SDLoc &dl, SelectionDAG &DAG) const {
  const MachineFunction &MF = DAG.getMachineFunction();
  const Function &F = MF.getFunction();
  Type *RetTy = MF.getFunction().getReturnType();

  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
    return Chain;

  const DataLayout &DL = DAG.getDataLayout();
  SmallVector<SDValue, 16> PromotedOutVals;
  SmallVector<EVT, 16> VTs;
  SmallVector<uint64_t, 16> Offsets;
  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");

  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue PromotedOutVal = OutVals[i];
    MVT PromotedVT;
    if (PromoteScalarIntegerPTX(VTs[i], &PromotedVT)) {
      VTs[i] = EVT(PromotedVT);
    }
    if (PromoteScalarIntegerPTX(PromotedOutVal.getValueType(), &PromotedVT)) {
      llvm::ISD::NodeType Ext =
          Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      PromotedOutVal = DAG.getNode(Ext, dl, PromotedVT, PromotedOutVal);
    }
    PromotedOutVals.push_back(PromotedOutVal);
  }

  auto VectorInfo = VectorizePTXValueVTs(
      VTs, Offsets,
      RetTy->isSized() ? getFunctionParamOptimizedAlign(&F, RetTy, DL)
                       : Align(1));

  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
  // 32-bits are sign extended or zero extended, depending on whether
  // they are signed or unsigned types.
  bool ExtendIntegerRetVal =
      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;

  SmallVector<SDValue, 6> StoreOperands;
  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
    SDValue OutVal = OutVals[i];
    SDValue RetVal = PromotedOutVals[i];

    if (ExtendIntegerRetVal) {
      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
                                                  : ISD::ZERO_EXTEND,
                           dl, MVT::i32, RetVal);
    } else if (OutVal.getValueSizeInBits() < 16) {
      // Use 16-bit registers for small load-stores as it's the
      // smallest general purpose register size supported by NVPTX.
      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
    }

    // If we have a PVF_SCALAR entry, it may not even be sufficiently aligned
    // for a scalar store. In such cases, fall back to byte stores.
    if (VectorInfo[i] == PVF_SCALAR && RetTy->isAggregateType()) {
      EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Align ElementTypeAlign =
          DL.getABITypeAlign(ElementType.getTypeForEVT(RetTy->getContext()));
      Align ElementAlign =
          commonAlignment(DL.getABITypeAlign(RetTy), Offsets[i]);
      if (ElementAlign < ElementTypeAlign) {
        assert(StoreOperands.empty() && "Orphaned operand list.");
        Chain = LowerUnalignedStoreRet(DAG, Chain, Offsets[i], ElementType,
                                       RetVal, dl);

        // The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes
        // into the graph, so just move on to the next element.
        continue;
      }
    }

    // New load/store. Record chain and offset operands.
    if (VectorInfo[i] & PVF_FIRST) {
      assert(StoreOperands.empty() && "Orphaned operand list.");
      StoreOperands.push_back(Chain);
      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
    }

    // Record the value to return.
    StoreOperands.push_back(RetVal);

    // That's the last element of this store op.
    if (VectorInfo[i] & PVF_LAST) {
      NVPTXISD::NodeType Op;
      unsigned NumElts = StoreOperands.size() - 2;
      switch (NumElts) {
      case 1:
        Op = NVPTXISD::StoreRetval;
        break;
      case 2:
        Op = NVPTXISD::StoreRetvalV2;
        break;
      case 4:
        Op = NVPTXISD::StoreRetvalV4;
        break;
      default:
        llvm_unreachable("Invalid vector info.");
      }

      // Adjust type of load/store op if we've extended the scalar
      // return value.
      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
      Chain = DAG.getMemIntrinsicNode(
          Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
          MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
      // Cleanup vector state.
      StoreOperands.clear();
    }
  }

  return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
}
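
// As an illustration, a function returning <4 x float> typically reaches the
// PVF_LAST case above with NumElts == 4, emitting one StoreRetvalV4 that
// becomes a single "st.param.v4.f32 [func_retval0+0], {...};" in PTX.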

void NVPTXTargetLowering::LowerAsmOperandForConstraint(
    SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
    SelectionDAG &DAG) const {
  if (Constraint.size() > 1)
    return;
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
// TgtMemIntrinsic because we need the information that is only available in
// the "Value" type of the destination pointer. In particular, the address
// space information.
bool NVPTXTargetLowering::getTgtMemIntrinsic(
    IntrinsicInfo &Info, const CallInst &I,
    MachineFunction &MF, unsigned Intrinsic) const {
  switch (Intrinsic) {
  default:
    return false;
  case Intrinsic::nvvm_match_all_sync_i32p:
  case Intrinsic::nvvm_match_all_sync_i64p:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
    // in order to model data exchange with other threads, but perform no real
    // memory accesses.
    Info.memVT = MVT::i1;

    // Our result depends on both our and other thread's arguments.
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    return true;
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:

  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(4);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:

  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:

  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
  case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:

  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
  case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(8);
    return true;
  }

  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
  case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
    Info.opc = ISD::INTRINSIC_VOID;
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOStore;
    Info.align = Align(16);
    return true;
  }

  case Intrinsic::nvvm_atomic_load_inc_32:
  case Intrinsic::nvvm_atomic_load_dec_32:

  case Intrinsic::nvvm_atomic_add_gen_f_cta:
  case Intrinsic::nvvm_atomic_add_gen_f_sys:
  case Intrinsic::nvvm_atomic_add_gen_i_cta:
  case Intrinsic::nvvm_atomic_add_gen_i_sys:
  case Intrinsic::nvvm_atomic_and_gen_i_cta:
  case Intrinsic::nvvm_atomic_and_gen_i_sys:
  case Intrinsic::nvvm_atomic_cas_gen_i_cta:
  case Intrinsic::nvvm_atomic_cas_gen_i_sys:
  case Intrinsic::nvvm_atomic_dec_gen_i_cta:
  case Intrinsic::nvvm_atomic_dec_gen_i_sys:
  case Intrinsic::nvvm_atomic_inc_gen_i_cta:
  case Intrinsic::nvvm_atomic_inc_gen_i_sys:
  case Intrinsic::nvvm_atomic_max_gen_i_cta:
  case Intrinsic::nvvm_atomic_max_gen_i_sys:
  case Intrinsic::nvvm_atomic_min_gen_i_cta:
  case Intrinsic::nvvm_atomic_min_gen_i_sys:
  case Intrinsic::nvvm_atomic_or_gen_i_cta:
  case Intrinsic::nvvm_atomic_or_gen_i_sys:
  case Intrinsic::nvvm_atomic_exch_gen_i_cta:
  case Intrinsic::nvvm_atomic_exch_gen_i_sys:
  case Intrinsic::nvvm_atomic_xor_gen_i_cta:
  case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
    auto &DL = I.getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
    Info.align.reset();
    return true;
  }

  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    auto &DL = I.getDataLayout();
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
      Info.memVT = getValueType(DL, I.getType());
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
      Info.memVT = getPointerTy(DL);
    else
      Info.memVT = getValueType(DL, I.getType());
    Info.ptrVal = I.getArgOperand(0);
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();

    return true;
  }
  case Intrinsic::nvvm_tex_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_tex_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
  case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
  case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
  case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
  case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i8_clamp:
  case Intrinsic::nvvm_suld_1d_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_i8_clamp:
  case Intrinsic::nvvm_suld_2d_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_v4i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
  case Intrinsic::nvvm_suld_3d_i8_clamp:
  case Intrinsic::nvvm_suld_3d_v2i8_clamp:
  case Intrinsic::nvvm_suld_3d_v4i8_clamp:
  case Intrinsic::nvvm_suld_1d_i8_trap:
  case Intrinsic::nvvm_suld_1d_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_array_i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_i8_trap:
  case Intrinsic::nvvm_suld_2d_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_v4i8_trap:
  case Intrinsic::nvvm_suld_2d_array_i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
  case Intrinsic::nvvm_suld_3d_i8_trap:
  case Intrinsic::nvvm_suld_3d_v2i8_trap:
  case Intrinsic::nvvm_suld_3d_v4i8_trap:
  case Intrinsic::nvvm_suld_1d_i8_zero:
  case Intrinsic::nvvm_suld_1d_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_v4i8_zero:
  case Intrinsic::nvvm_suld_1d_array_i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_i8_zero:
  case Intrinsic::nvvm_suld_2d_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_v4i8_zero:
  case Intrinsic::nvvm_suld_2d_array_i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
  case Intrinsic::nvvm_suld_3d_i8_zero:
  case Intrinsic::nvvm_suld_3d_v2i8_zero:
  case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i16_clamp:
  case Intrinsic::nvvm_suld_1d_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_i16_clamp:
  case Intrinsic::nvvm_suld_2d_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_v4i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
  case Intrinsic::nvvm_suld_3d_i16_clamp:
  case Intrinsic::nvvm_suld_3d_v2i16_clamp:
  case Intrinsic::nvvm_suld_3d_v4i16_clamp:
  case Intrinsic::nvvm_suld_1d_i16_trap:
  case Intrinsic::nvvm_suld_1d_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_array_i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_i16_trap:
  case Intrinsic::nvvm_suld_2d_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_v4i16_trap:
  case Intrinsic::nvvm_suld_2d_array_i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
  case Intrinsic::nvvm_suld_3d_i16_trap:
  case Intrinsic::nvvm_suld_3d_v2i16_trap:
  case Intrinsic::nvvm_suld_3d_v4i16_trap:
  case Intrinsic::nvvm_suld_1d_i16_zero:
  case Intrinsic::nvvm_suld_1d_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_v4i16_zero:
  case Intrinsic::nvvm_suld_1d_array_i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_i16_zero:
  case Intrinsic::nvvm_suld_2d_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_v4i16_zero:
  case Intrinsic::nvvm_suld_2d_array_i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
  case Intrinsic::nvvm_suld_3d_i16_zero:
  case Intrinsic::nvvm_suld_3d_v2i16_zero:
  case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i32_clamp:
  case Intrinsic::nvvm_suld_1d_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_i32_clamp:
  case Intrinsic::nvvm_suld_2d_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_v4i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
  case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
  case Intrinsic::nvvm_suld_3d_i32_clamp:
  case Intrinsic::nvvm_suld_3d_v2i32_clamp:
  case Intrinsic::nvvm_suld_3d_v4i32_clamp:
  case Intrinsic::nvvm_suld_1d_i32_trap:
  case Intrinsic::nvvm_suld_1d_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_array_i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_i32_trap:
  case Intrinsic::nvvm_suld_2d_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_v4i32_trap:
  case Intrinsic::nvvm_suld_2d_array_i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
  case Intrinsic::nvvm_suld_3d_i32_trap:
  case Intrinsic::nvvm_suld_3d_v2i32_trap:
  case Intrinsic::nvvm_suld_3d_v4i32_trap:
  case Intrinsic::nvvm_suld_1d_i32_zero:
  case Intrinsic::nvvm_suld_1d_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_v4i32_zero:
  case Intrinsic::nvvm_suld_1d_array_i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_i32_zero:
  case Intrinsic::nvvm_suld_2d_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_v4i32_zero:
  case Intrinsic::nvvm_suld_2d_array_i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
  case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
  case Intrinsic::nvvm_suld_3d_i32_zero:
  case Intrinsic::nvvm_suld_3d_v2i32_zero:
  case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;

  case Intrinsic::nvvm_suld_1d_i64_clamp:
  case Intrinsic::nvvm_suld_1d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_i64_clamp:
  case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_i64_clamp:
  case Intrinsic::nvvm_suld_2d_v2i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_i64_clamp:
  case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
  case Intrinsic::nvvm_suld_3d_i64_clamp:
  case Intrinsic::nvvm_suld_3d_v2i64_clamp:
  case Intrinsic::nvvm_suld_1d_i64_trap:
  case Intrinsic::nvvm_suld_1d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_array_i64_trap:
  case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_i64_trap:
  case Intrinsic::nvvm_suld_2d_v2i64_trap:
  case Intrinsic::nvvm_suld_2d_array_i64_trap:
  case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
  case Intrinsic::nvvm_suld_3d_i64_trap:
  case Intrinsic::nvvm_suld_3d_v2i64_trap:
  case Intrinsic::nvvm_suld_1d_i64_zero:
  case Intrinsic::nvvm_suld_1d_v2i64_zero:
  case Intrinsic::nvvm_suld_1d_array_i64_zero:
  case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_i64_zero:
  case Intrinsic::nvvm_suld_2d_v2i64_zero:
  case Intrinsic::nvvm_suld_2d_array_i64_zero:
  case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
  case Intrinsic::nvvm_suld_3d_i64_zero:
  case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.opc = ISD::INTRINSIC_W_CHAIN;
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
    Info.offset = 0;
    Info.flags = MachineMemOperand::MOLoad;
    Info.align = Align(16);
    return true;
  }
  return false;
}

/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
Align NVPTXTargetLowering::getFunctionParamOptimizedAlign(
    const Function *F, Type *ArgTy, const DataLayout &DL) const {
  // Cap the alignment at 128 bytes as that is the maximum alignment
  // supported by PTX.
  const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));

  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return ABITypeAlign;

  assert(!isKernelFunction(*F) && "Expect kernels to have non-local linkage");
  return std::max(Align(16), ABITypeAlign);
}
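
// For example, a float argument (ABI alignment 4) of an internal function
// whose address is never taken is given Align 16 here, which permits 128-bit
// vectorized access to the .param space it lives in.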

/// Helper for computing alignment of a device function byval parameter.
Align NVPTXTargetLowering::getFunctionByValParamAlign(
    const Function *F, Type *ArgTy, Align InitialAlign,
    const DataLayout &DL) const {
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  if (F)
    ArgAlign = std::max(ArgAlign, getFunctionParamOptimizedAlign(F, ArgTy, DL));

  // Old ptx versions have a bug. When PTX code takes the address of a
  // byval parameter with alignment < 4, ptxas generates code to
  // spill the argument into memory. Alas, on sm_50+ ptxas generates
  // SASS code that fails with a misaligned access. To work around
  // the problem, make sure that we align byval parameters by at
  // least 4. This bug seems to be fixed at least starting from
  // ptxas > 9.0.
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  if (ForceMinByValParamAlign)
    ArgAlign = std::max(ArgAlign, Align(4));

  return ArgAlign;
}

// Helper for getting a function parameter name. The name is composed from
// its index and the function name. A negative index corresponds to the
// special parameter (unsized array) used for passing variable arguments.
std::string NVPTXTargetLowering::getParamName(const Function *F,
                                              int Idx) const {
  std::string ParamName;
  raw_string_ostream ParamStr(ParamName);

  ParamStr << getTargetMachine().getSymbol(F)->getName();
  if (Idx < 0)
    ParamStr << "_vararg";
  else
    ParamStr << "_param_" << Idx;

  return ParamName;
}

/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
bool NVPTXTargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                                const AddrMode &AM, Type *Ty,
                                                unsigned AS,
                                                Instruction *I) const {
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  // - [avar]
  // - [areg]
  // - [areg+immoff]
  // - [immAddr]

  // immoff must fit in a signed 32-bit int
  if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
    return false;

  if (AM.BaseGV)
    return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;

  switch (AM.Scale) {
  case 0: // "r", "r+i" or "i" is allowed
    break;
  case 1:
    if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
      return false;
    // Otherwise we have r+i.
    break;
  default:
    // No scale > 1 is allowed
    return false;
  }
  return true;
}
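
// For example, an access like [%r1+16] (base register plus immediate) is
// accepted above, while a register-plus-register form [%r1+%r2] or any
// scaled form such as [%r1 + 4*%r2] is rejected.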

//===----------------------------------------------------------------------===//
//                         NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
NVPTXTargetLowering::ConstraintType
NVPTXTargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default:
      break;
    case 'b':
    case 'r':
    case 'h':
    case 'c':
    case 'l':
    case 'f':
    case 'd':
    case 'q':
    case '0':
    case 'N':
      return C_RegisterClass;
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

std::pair<unsigned, const TargetRegisterClass *>
NVPTXTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                  StringRef Constraint,
                                                  MVT VT) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'b':
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
    case 'c':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'h':
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
    case 'r':
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
    case 'l':
    case 'N':
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
    case 'q': {
      if (STI.getSmVersion() < 70)
        report_fatal_error("Inline asm with 128 bit operands is only "
                           "supported for sm_70 and higher!");
      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
    }
    case 'f':
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
    case 'd':
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);
    }
  }
  return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

bool NVPTXTargetLowering::allowFMA(MachineFunction &MF,
                                   CodeGenOptLevel OptLevel) const {
  // Always honor command-line argument
  if (FMAContractLevelOpt.getNumOccurrences() > 0)
    return FMAContractLevelOpt > 0;

  // Do not contract if we're not optimizing the code.
  if (OptLevel == CodeGenOptLevel::None)
    return false;

  // Honor TargetOptions flags that explicitly say fusion is okay.
  if (MF.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast)
    return true;

  return allowUnsafeFPMath(MF);
}

bool NVPTXTargetLowering::allowUnsafeFPMath(MachineFunction &MF) const {
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  if (MF.getTarget().Options.UnsafeFPMath)
    return true;

  // Allow unsafe math if the unsafe-fp-math attribute explicitly says so.
  const Function &F = MF.getFunction();
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

static bool isConstZero(const SDValue &Operand) {
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 0;
}

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
static SDValue
PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {
  EVT VT = N0.getValueType();

  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  if (!N0.getNode()->hasOneUse())
    return SDValue();

  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (add (mul a, b), c))
  //
  if (N0.getOpcode() == ISD::SELECT) {
    unsigned ZeroOpNum;
    if (isConstZero(N0->getOperand(1)))
      ZeroOpNum = 1;
    else if (isConstZero(N0->getOperand(2)))
      ZeroOpNum = 2;
    else
      return SDValue();

    SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
    if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
      return SDValue();

    SDLoc DL(N);
    SDValue Mul =
        DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
    SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
    return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
                             ((ZeroOpNum == 1) ? N1 : MAD),
                             ((ZeroOpNum == 1) ? MAD : N1));
  }

  return SDValue();
}
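// An illustrative instance of the fold above, with i32 values:
//   (add (select %cond, 0, (mul %a, %b)), %c)
//     -> (select %cond, %c, (add (mul %a, %b), %c))
// so that the mul+add pair on the non-zero path can later be selected as a
// single integer multiply-add.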

static SDValue
PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                               TargetLowering::DAGCombinerInfo &DCI,
                               CodeGenOptLevel OptLevel) {
  EVT VT = N0.getValueType();
  if (N0.getOpcode() == ISD::FMUL) {
    const auto *TLI = static_cast<const NVPTXTargetLowering *>(
        &DCI.DAG.getTargetLoweringInfo());
    if (!TLI->allowFMA(DCI.DAG.getMachineFunction(), OptLevel))
      return SDValue();

    // For floating point:
    // Do the fusion only when the mul has fewer than 5 uses and all of
    // them are add.
    // The heuristic is that if a use is not an add, then that use
    // cannot be fused into an fma, so the mul is still needed anyway.
    // If there are more than 4 uses, even if they are all add, fusing
    // them will increase register pressure.
    //
    int numUses = 0;
    int nonAddCount = 0;
    for (const SDNode *User : N0.getNode()->users()) {
      numUses++;
      if (User->getOpcode() != ISD::FADD)
        ++nonAddCount;
      if (numUses >= 5)
        return SDValue();
    }
    if (nonAddCount) {
      int orderNo = N->getIROrder();
      int orderNo2 = N0.getNode()->getIROrder();
      // Simple heuristic here to estimate potential register pressure:
      // the difference in IR order is used to measure the distance
      // between def and use; the longer the distance, the more likely
      // it is to cause register pressure.
      if (orderNo - orderNo2 < 500)
        return SDValue();

      // Now, check if at least one of the FMUL's operands is live beyond the
      // node N, which guarantees that the FMA will not increase register
      // pressure at node N.
      bool opIsLive = false;
      const SDNode *left = N0.getOperand(0).getNode();
      const SDNode *right = N0.getOperand(1).getNode();

      if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
        opIsLive = true;

      if (!opIsLive)
        for (const SDNode *User : left->users()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        for (const SDNode *User : right->users()) {
          int orderNo3 = User->getIROrder();
          if (orderNo3 > orderNo) {
            opIsLive = true;
            break;
          }
        }

      if (!opIsLive)
        return SDValue();
    }

    return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
                           N0.getOperand(1), N1);
  }

  return SDValue();
}
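// An illustrative instance of the contraction above:
//   (fadd (fmul %a, %b), %c) -> (fma %a, %b, %c)
// performed only when allowFMA() holds and the register-pressure
// heuristics above do not veto it.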

static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front,
                                         std::size_t Back) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  return SDValue();
}

static SDValue PerformStoreParamCombine(SDNode *N) {
  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  return PerformStoreCombineHelper(N, 3, 1);
}

static SDValue PerformStoreRetvalCombine(SDNode *N) {
  // Operands from the 2nd to the last one are the values to be stored.
  return PerformStoreCombineHelper(N, 2, 0);
}

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
///
static SDValue PerformADDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Skip non-integer, non-scalar case
  EVT VT = N0.getValueType();
  if (VT.isVector() || VT != MVT::i32)
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformADDCombineWithOperands(N, N1, N0, DCI);
}

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
///
static SDValue PerformFADDCombine(SDNode *N,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  CodeGenOptLevel OptLevel) {
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  EVT VT = N0.getValueType();
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
    return SDValue();

  // First try with the default operand order.
  if (SDValue Result = PerformFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
    return Result;

  // If that didn't work, try again with the operands commuted.
  return PerformFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
}

static SDValue PerformANDCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if the target type is an integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  SDValue Val = N->getOperand(0);
  SDValue Mask = N->getOperand(1);

  if (isa<ConstantSDNode>(Val)) {
    std::swap(Val, Mask);
  }

  SDValue AExt;

  // Convert BFE -> truncate i16 -> and 255
  // to just BFE -> truncate i16, as the value already has all the bits in the
  // right places.
  if (Val.getOpcode() == ISD::TRUNCATE) {
    SDValue BFE = Val.getOperand(0);
    if (BFE.getOpcode() != NVPTXISD::BFE)
      return SDValue();

    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
    if (!BFEBits)
      return SDValue();
    uint64_t BFEBitsVal = BFEBits->getZExtValue();

    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }
    uint64_t MaskVal = MaskCnst->getZExtValue();

    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
      return SDValue();
    // If we get here, the AND is unnecessary. Just replace it with the trunc.
    DCI.CombineTo(N, Val, false);
  }
  // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
  if (Val.getOpcode() == ISD::ANY_EXTEND) {
    AExt = Val;
    Val = Val->getOperand(0);
  }

  if (Val->getOpcode() == NVPTXISD::LoadV2 ||
      Val->getOpcode() == NVPTXISD::LoadV4) {
    ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(Mask);
    if (!MaskCnst) {
      // Not an AND with a constant
      return SDValue();
    }

    uint64_t MaskVal = MaskCnst->getZExtValue();
    if (MaskVal != 0xff) {
      // Not an AND that chops off the top 8 bits
      return SDValue();
    }

    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
    if (!Mem) {
      // Not a MemSDNode?!?
      return SDValue();
    }

    EVT MemVT = Mem->getMemoryVT();
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
      return SDValue();
    }

    unsigned ExtType = Val->getConstantOperandVal(Val->getNumOperands() - 1);
    if (ExtType == ISD::SEXTLOAD) {
      // If for some reason the load is a sextload, the and is needed to zero
      // out the high 8 bits.
      return SDValue();
    }

    bool AddTo = false;
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
      Val = DCI.DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
                            AExt.getValueType(), Val);
      AddTo = true;
    }

    // If we get here, the AND is unnecessary. Just replace it with the load.
    DCI.CombineTo(N, Val, AddTo);
  }

  return SDValue();
}

static SDValue PerformREMCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);

  // Don't do anything at less than -O2.
  if (OptLevel < CodeGenOptLevel::Default)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  bool IsSigned = N->getOpcode() == ISD::SREM;
  unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;

  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);

  for (const SDNode *U : Num->users()) {
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den
      return DAG.getNode(ISD::SUB, DL, VT, Num,
                         DAG.getNode(ISD::MUL, DL, VT,
                                     DAG.getNode(DivOpc, DL, VT, Num, Den),
                                     Den));
    }
  }
  return SDValue();
}
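// For example, if the DAG already contains both (udiv %n, %d) and
// (urem %n, %d), the remainder is rewritten as
//   (sub %n, (mul (udiv %n, %d), %d))
// reusing the already-computed quotient instead of emitting a separate
// remainder operation.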

enum OperandSignedness {
  Signed = 0,
  Unsigned,
  Unknown
};

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
static bool IsMulWideOperandDemotable(SDValue Op,
                                      unsigned OptSize,
                                      OperandSignedness &S) {
  S = Unknown;

  if (Op.getOpcode() == ISD::SIGN_EXTEND ||
      Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Signed;
      return true;
    }
  } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
    EVT OrigVT = Op.getOperand(0).getValueType();
    if (OrigVT.getFixedSizeInBits() <= OptSize) {
      S = Unsigned;
      return true;
    }
  }

  return false;
}

/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS,
                                        unsigned OptSize,
                                        bool &IsSigned) {
  OperandSignedness LHSSign;

  // The LHS operand must be a demotable op
  if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
    return false;

  // We should have been able to determine the signedness from the LHS
  if (LHSSign == Unknown)
    return false;

  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
  if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(RHS)) {
    const APInt &Val = CI->getAPIntValue();
    if (LHSSign == Unsigned) {
      return Val.isIntN(OptSize);
    } else {
      return Val.isSignedIntN(OptSize);
    }
  } else {
    OperandSignedness RHSSign;
    if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
      return false;

    return LHSSign == RHSSign;
  }
}

/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
static SDValue TryMULWIDECombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {
    return SDValue();
  }

  SDLoc DL(N);
  unsigned OptSize = MulType.getSizeInBits() >> 1;
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Canonicalize the multiply so the constant (if any) is on the right
  if (N->getOpcode() == ISD::MUL) {
    if (isa<ConstantSDNode>(LHS)) {
      std::swap(LHS, RHS);
    }
  }

  // If we have a SHL, determine the actual multiply amount
  if (N->getOpcode() == ISD::SHL) {
    ConstantSDNode *ShlRHS = dyn_cast<ConstantSDNode>(RHS);
    if (!ShlRHS) {
      return SDValue();
    }

    APInt ShiftAmt = ShlRHS->getAPIntValue();
    unsigned BitWidth = MulType.getSizeInBits();
    if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
      APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
      RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
    } else {
      return SDValue();
    }
  }

  bool Signed;
  // Verify that our operands are demotable
  if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
    return SDValue();
  }

  EVT DemotedVT;
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
  } else {
    DemotedVT = MVT::i32;
  }

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  SDValue TruncLHS = DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
  SDValue TruncRHS = DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);

  unsigned Opc;
  if (Signed) {
    Opc = NVPTXISD::MUL_WIDE_SIGNED;
  } else {
    Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
  }

  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
}
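// Illustrative instances of the demotion above:
//   (mul i32 (sext i16 %a), (sext i16 %b)) -> MUL_WIDE_SIGNED %a, %b
//     (i.e. a 16x16 -> 32-bit mul.wide)
//   (shl i32 (zext i16 %a), 4)             -> MUL_WIDE_UNSIGNED %a, 16
//     (since 1 << 4 == 16)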

static bool isConstOne(const SDValue &Operand) {
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 1;
}

static SDValue matchMADConstOnePattern(SDValue Add) {
  if (Add->getOpcode() != ISD::ADD)
    return SDValue();

  if (isConstOne(Add->getOperand(0)))
    return Add->getOperand(1);

  if (isConstOne(Add->getOperand(1)))
    return Add->getOperand(0);

  return SDValue();
}

static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL,
                                  TargetLowering::DAGCombinerInfo &DCI) {

  if (SDValue Y = matchMADConstOnePattern(Add)) {
    SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
    return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
  }

  return SDValue();
}

static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT,
                                        SDLoc DL,
                                        TargetLowering::DAGCombinerInfo &DCI) {
  if (Select->getOpcode() != ISD::SELECT)
    return SDValue();

  SDValue Cond = Select->getOperand(0);

  unsigned ConstOpNo;
  if (isConstOne(Select->getOperand(1)))
    ConstOpNo = 1;
  else if (isConstOne(Select->getOperand(2)))
    ConstOpNo = 2;
  else
    return SDValue();

  SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);

  // Do not combine if the resulting sequence is not obviously profitable.
  if (!matchMADConstOnePattern(Y))
    return SDValue();

  SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);

  return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
                         (ConstOpNo == 1) ? X : NewMul,
                         (ConstOpNo == 1) ? NewMul : X);
}

static SDValue
PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
                              TargetLowering::DAGCombinerInfo &DCI) {

  EVT VT = N0.getValueType();
  if (VT.isVector())
    return SDValue();

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  SDLoc DL(N);

  // (mul x, (add y, 1)) -> (add (mul x, y), x)
  if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
    return Res;

  // (mul x, (select c, 1, y)) -> (select c, x, (mul x, y))
  if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
    return Res;
  if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
    return Res;

  return SDValue();
}

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
static SDValue PerformMULCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel == CodeGenOptLevel::None)
    return SDValue();

  if (SDValue Ret = TryMULWIDECombine(N, DCI))
    return Ret;

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  return PerformMULCombineWithOperands(N, N0, N1, DCI);
}

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static SDValue PerformSHLCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
                                 CodeGenOptLevel OptLevel) {
  if (OptLevel > CodeGenOptLevel::None) {
    // Try mul.wide combining at OptLevel > 0
    if (SDValue Ret = TryMULWIDECombine(N, DCI))
      return Ret;
  }

  return SDValue();
}

static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI,
                                   unsigned int SmVersion) {
  EVT CCType = N->getValueType(0);
  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);

  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
    return SDValue();

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
    return SDValue();

  SDLoc DL(N);
  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
  // instruction.
  SDValue CCNode = DCI.DAG.getNode(
      A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
                                     : NVPTXISD::SETP_BF16X2,
      DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
                         CCNode.getValue(1));
}

static SDValue PerformEXTRACTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue Vector = N->getOperand(0);
  if (Vector->getOpcode() == ISD::FREEZE)
    Vector = Vector->getOperand(0);
  SDLoc DL(N);
  EVT VectorVT = Vector.getValueType();
  if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
      IsPTXVectorType(VectorVT.getSimpleVT()))
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or the v2*16, v4i8 and v8i8 types; we already
  // handle them OK.
  if (VectorVT.getVectorNumElements() == 1 || Isv2x16VT(VectorVT) ||
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)
    return SDValue();

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
    return SDValue();

  uint64_t VectorBits = VectorVT.getSizeInBits();
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
    return SDValue();

  ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
  // Index == 0 is handled by the generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)
    return SDValue();

  MVT IVT = MVT::getIntegerVT(VectorBits);
  EVT EltVT = VectorVT.getVectorElementType();
  EVT EltIVT = EltVT.changeTypeToInteger();
  uint64_t EltBits = EltVT.getScalarSizeInBits();

  SDValue Result = DCI.DAG.getNode(
      ISD::TRUNCATE, DL, EltIVT,
      DCI.DAG.getNode(
          ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
          DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));

  // If the element has a non-integer type, bitcast it back to the expected
  // type.
  if (EltVT != EltIVT)
    Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
  // Past the legalizer, we may need to extend i8 -> i16 to match the register
  // type.
  if (EltVT != N->getValueType(0))
    Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);

  return Result;
}
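// An illustrative instance of the rewrite above, extracting element 1 of a
// v2f16 vector:
//   (extractelt v2f16 %v, 1)
//     -> (bitcast f16 (trunc i16 (sra i32 (bitcast i32 %v), 16)))
// i.e. the whole vector is moved into an integer register and the requested
// element is shifted down into the low bits.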

static SDValue PerformVSELECTCombine(SDNode *N,
                                     TargetLowering::DAGCombinerInfo &DCI) {
  SDValue VA = N->getOperand(1);
  EVT VectorVT = VA.getValueType();
  if (VectorVT != MVT::v4i8)
    return SDValue();

  // We need to split the vselect into individual per-element operations.
  // Because we use BFE/BFI instructions for byte extraction/insertion, we
  // do end up with 32-bit values, so we may as well do the comparison as
  // i32 to avoid the conversions to/from i16 normally used for i8 values.
  SmallVector<SDValue, 4> E;
  SDLoc DL(N);
  SDValue VCond = N->getOperand(0);
  SDValue VB = N->getOperand(2);
  for (int I = 0; I < 4; ++I) {
    SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
                                DCI.DAG.getConstant(I, DL, MVT::i32));
    SDValue EA = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    SDValue EB = DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
                        DCI.DAG.getConstant(I, DL, MVT::i32)),
        DL, MVT::i32);
    E.push_back(DCI.DAG.getAnyExtOrTrunc(
        DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
  }
  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
}
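// In effect, (vselect v4i1 %c, v4i8 %a, v4i8 %b) becomes four i32 SELECT
// nodes on the extended elements, whose i8 results are reassembled with a
// single BUILD_VECTOR.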

static SDValue
PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
  auto VT = N->getValueType(0);
  if (!DCI.isAfterLegalizeDAG() || !Isv2x16VT(VT))
    return SDValue();

  auto Op0 = N->getOperand(0);
  auto Op1 = N->getOperand(1);

  // Start out by assuming we want to take the lower 2 bytes of each i32
  // operand.
  uint64_t Op0Bytes = 0x10;
  uint64_t Op1Bytes = 0x54;

  std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
                                                {&Op1, &Op1Bytes}};

  // Check that each operand is an i16, truncated from an i32 operand. We'll
  // select individual bytes from those original operands. Optionally, fold in
  // a shift right of that original operand.
  for (auto &[Op, OpBytes] : OpData) {
    // Eat up any bitcast
    if (Op->getOpcode() == ISD::BITCAST)
      *Op = Op->getOperand(0);

    if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
          Op->getOperand(0).getValueType() == MVT::i32))
      return SDValue();

    // If the truncate has multiple uses, this optimization can increase
    // register pressure.
    if (!Op->hasOneUse())
      return SDValue();

    *Op = Op->getOperand(0);

    // Optionally, fold in a shift-right of the original operand and let the
    // permute pick the two higher bytes of the original value directly.
    if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
      if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
        // Shift the PRMT byte selector to pick upper bytes from each
        // respective value, instead of the lower ones: 0x10 -> 0x32,
        // 0x54 -> 0x76.
        assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
               "PRMT selector values out of range");
        *OpBytes += 0x22;
        *Op = Op->getOperand(0);
      }
    }
  }

  SDLoc DL(N);
  auto &DAG = DCI.DAG;

  auto PRMT = DAG.getNode(
      NVPTXISD::PRMT, DL, MVT::v4i8,
      {Op0, Op1, DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
       DAG.getConstant(NVPTX::PTXPrmtMode::NONE, DL, MVT::i32)});
  return DAG.getNode(ISD::BITCAST, DL, VT, PRMT);
}
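// An illustrative instance of the combine above, building a v2i16 from the
// low half of %a and the high half of %b:
//   (build_vector (trunc i16 %a), (trunc i16 (srl %b, 16)))
//     -> (bitcast v2i16 (PRMT %a, %b, selector, NONE))
// where the selector is (0x76 << 8) | 0x10 == 0x7610: bytes {1,0} come from
// %a and bytes {7,6} from %b after the 0x22 adjustment for the srl.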

SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
  switch (N->getOpcode()) {
  default:
    break;
  case ISD::ADD:
    return PerformADDCombine(N, DCI, OptLevel);
  case ISD::FADD:
    return PerformFADDCombine(N, DCI, OptLevel);
  case ISD::MUL:
    return PerformMULCombine(N, DCI, OptLevel);
  case ISD::SHL:
    return PerformSHLCombine(N, DCI, OptLevel);
  case ISD::AND:
    return PerformANDCombine(N, DCI);
  case ISD::UREM:
  case ISD::SREM:
    return PerformREMCombine(N, DCI, OptLevel);
  case ISD::SETCC:
    return PerformSETCCCombine(N, DCI, STI.getSmVersion());
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    return PerformStoreRetvalCombine(N);
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
    return PerformStoreParamCombine(N);
  case ISD::EXTRACT_VECTOR_ELT:
    return PerformEXTRACTCombine(N, DCI);
  case ISD::VSELECT:
    return PerformVSELECTCombine(N, DCI);
  case ISD::BUILD_VECTOR:
    return PerformBUILD_VECTORCombine(N, DCI);
  }
  return SDValue();
}

static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &Results) {
  // Handle bitcasting to v2i8 without hitting the default promotion
  // strategy which goes through stack memory.
  SDValue Op(Node, 0);
  EVT ToVT = Op->getValueType(0);
  if (ToVT != MVT::v2i8) {
    return;
  }

  // Bitcast to i16 and unpack elements into a vector
  SDLoc DL(Node);
  SDValue AsInt = MaybeBitcast(DAG, DL, MVT::i16, Op->getOperand(0));
  SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
  SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
  SDValue Vec1 =
      DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                  DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
  Results.push_back(
      DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
}

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                              SmallVectorImpl<SDValue> &Results) {
  EVT ResVT = N->getValueType(0);
  SDLoc DL(N);

  assert(ResVT.isVector() && "Vector load must have vector type");

  auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
  if (!NumEltsAndEltVT)
    return;
  auto [NumElts, EltVT] = NumEltsAndEltVT.value();

  LoadSDNode *LD = cast<LoadSDNode>(N);

  Align Alignment = LD->getAlign();
  auto &TD = DAG.getDataLayout();
  Align PrefAlign =
      TD.getPrefTypeAlign(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.
    return;
  }

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
  if (EltVT.getSizeInBits() < 16) {
    EltVT = MVT::i16;
    NeedTrunc = true;
  }

  unsigned Opcode = 0;
  SDVTList LdResVTs;

  switch (NumElts) {
  default:
    return;
  case 2:
    Opcode = NVPTXISD::LoadV2;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    break;
  case 4: {
    Opcode = NVPTXISD::LoadV4;
    EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
    LdResVTs = DAG.getVTList(ListVTs);
    break;
  }
  }

  // Copy regular operands
  SmallVector<SDValue, 8> OtherOps(N->ops());

  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
  OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));

  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                          LD->getMemoryVT(),
                                          LD->getMemOperand());

  SmallVector<SDValue> ScalarRes;
  assert(NumElts <= ResVT.getVectorNumElements() &&
         "NumElts should not increase, only decrease or stay the same.");
  if (NumElts < ResVT.getVectorNumElements()) {
    // If the number of elements has decreased, getVectorLoweringShape has
    // upsized the element types.
    assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
           EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
    // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
    // into individual elements.
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue SubVector = NewLD.getValue(i);
      DAG.ExtractVectorElements(SubVector, ScalarRes);
    }
  } else {
    for (unsigned i = 0; i < NumElts; ++i) {
      SDValue Res = NewLD.getValue(i);
      if (NeedTrunc)
        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
      ScalarRes.push_back(Res);
    }
  }

  SDValue LoadChain = NewLD.getValue(NumElts);

  SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

  Results.push_back(BuildVec);
  Results.push_back(LoadChain);
}
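// For example, a sufficiently aligned (load <4 x float>) is replaced by one
// NVPTXISD::LoadV4 node producing four f32 values plus a chain, which are
// recombined into the original vector type with a BUILD_VECTOR.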

static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &Results) {
  SDValue Chain = N->getOperand(0);
  SDValue Intrin = N->getOperand(1);
  SDLoc DL(N);

  // Get the intrinsic ID
  unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
  switch (IntrinNo) {
  default:
    return;
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    if (ResVT.isVector()) {
      // Vector LDG/LDU

      unsigned NumElts = ResVT.getVectorNumElements();
      EVT EltVT = ResVT.getVectorElementType();

      // Since LDU/LDG are target nodes, we cannot rely on DAG type
      // legalization.
      // Therefore, we must ensure the type is legal. For i1 and i8, we set the
      // loaded type to i16 and propagate the "real" type as the memory type.
      bool NeedTrunc = false;
      if (EltVT.getSizeInBits() < 16) {
        EltVT = MVT::i16;
        NeedTrunc = true;
      }

      unsigned Opcode = 0;
      SDVTList LdResVTs;

      switch (NumElts) {
      default:
        return;
      case 2:
        Opcode = NVPTXISD::LDUV2;
        LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
        break;
      case 4: {
        Opcode = NVPTXISD::LDUV4;
        EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
        LdResVTs = DAG.getVTList(ListVTs);
        break;
      }
      }

      SmallVector<SDValue, 8> OtherOps;

      // Copy regular operands

      OtherOps.push_back(Chain); // Chain
      // Skip operand 1 (intrinsic ID)
      // Others
      OtherOps.append(N->op_begin() + 2, N->op_end());

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
                                              MemSD->getMemoryVT(),
                                              MemSD->getMemOperand());

      SmallVector<SDValue, 4> ScalarRes;

      for (unsigned i = 0; i < NumElts; ++i) {
        SDValue Res = NewLD.getValue(i);
        if (NeedTrunc)
          Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(),
                            Res);
        ScalarRes.push_back(Res);
      }

      SDValue LoadChain = NewLD.getValue(NumElts);

      SDValue BuildVec = DAG.getBuildVector(ResVT, DL, ScalarRes);

      Results.push_back(BuildVec);
      Results.push_back(LoadChain);
    } else {
      // i8 LDG/LDU
      assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
             "Custom handling of non-i8 ldu/ldg?");

      // Just copy all operands as-is
      SmallVector<SDValue, 4> Ops(N->ops());

      // Force output to i16
      SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);

      MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);

      // We make sure the memory type is i8, which will be used during isel
      // to select the proper instruction.
      SDValue NewLD =
          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
                                  MVT::i8, MemSD->getMemOperand());

      Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                    NewLD.getValue(0)));
      Results.push_back(NewLD.getValue(1));
    }
  }
  }
}

static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &Results) {
  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass legalization.
  SDLoc DL(N);
  SDValue Chain = N->getOperand(0);
  SDValue Reg = N->getOperand(1);
  SDValue Glue = N->getOperand(2);

  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
  SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
                                     N->getValueType(2)};
  SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};

  SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
                             {NewValue.getValue(0), NewValue.getValue(1)});

  Results.push_back(Pair);
  Results.push_back(NewValue.getValue(2));
  Results.push_back(NewValue.getValue(3));
}
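// For example, (CopyFromReg i128 %r) becomes a CopyFromReg producing two
// i64 halves that are glued back together with BUILD_PAIR, so the 128-bit
// value survives legalization as a pair of legal 64-bit values.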

void NVPTXTargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    report_fatal_error("Unhandled custom legalization");
  case ISD::BITCAST:
    ReplaceBITCAST(N, DAG, Results);
    return;
  case ISD::LOAD:
    ReplaceLoadVector(N, DAG, Results);
    return;
  case ISD::INTRINSIC_W_CHAIN:
    ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);
    return;
  case ISD::CopyFromReg:
    ReplaceCopyFromReg_128(N, DAG, Results);
    return;
  }
}

NVPTXTargetLowering::AtomicExpansionKind
NVPTXTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  Type *Ty = AI->getValOperand()->getType();

  if (AI->isFloatingPointOperation()) {
    if (AI->getOperation() == AtomicRMWInst::BinOp::FAdd) {
      if (Ty->isHalfTy() && STI.getSmVersion() >= 70 &&
          STI.getPTXVersion() >= 63)
        return AtomicExpansionKind::None;
      if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
          STI.getPTXVersion() >= 78)
        return AtomicExpansionKind::None;
      if (Ty->isFloatTy())
        return AtomicExpansionKind::None;
      if (Ty->isDoubleTy() && STI.hasAtomAddF64())
        return AtomicExpansionKind::None;
    }
    return AtomicExpansionKind::CmpXChg;
  }

  assert(Ty->isIntegerTy() && "Ty should be integer at this point");
  auto ITy = cast<llvm::IntegerType>(Ty);

  switch (AI->getOperation()) {
  default:
    return AtomicExpansionKind::CmpXChg;
  case AtomicRMWInst::BinOp::And:
  case AtomicRMWInst::BinOp::Or:
  case AtomicRMWInst::BinOp::Xor:
  case AtomicRMWInst::BinOp::Xchg:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomBitwise64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  case AtomicRMWInst::BinOp::Add:
  case AtomicRMWInst::BinOp::Sub:
  case AtomicRMWInst::BinOp::Max:
  case AtomicRMWInst::BinOp::Min:
  case AtomicRMWInst::BinOp::UMax:
  case AtomicRMWInst::BinOp::UMin:
    switch (ITy->getBitWidth()) {
    case 8:
    case 16:
      return AtomicExpansionKind::CmpXChg;
    case 32:
      return AtomicExpansionKind::None;
    case 64:
      if (STI.hasAtomMinMax64())
        return AtomicExpansionKind::None;
      return AtomicExpansionKind::CmpXChg;
    default:
      llvm_unreachable("unsupported width encountered");
    }
  }

  return AtomicExpansionKind::CmpXChg;
}
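// Illustrative consequences of the rules above:
//   atomicrmw fadd half -> native on sm_70 / PTX 6.3 and later, otherwise
//                          expanded to a compare-exchange loop
//   atomicrmw xor i16   -> always expanded to a compare-exchange loop
//   atomicrmw add i32   -> always handled natively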

// Pin NVPTXTargetObjectFile's vtables to this file.
NVPTXTargetObjectFile::~NVPTXTargetObjectFile() = default;

MCSection *NVPTXTargetObjectFile::SelectSectionForGlobal(
    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
  return getDataSection();
}
MAKE_CASE
#define MAKE_CASE(V)
F32
static const LLT F32
Definition:AMDGPULegalizerInfo.cpp:286
Select
AMDGPU Register Bank Select
Definition:AMDGPURegBankSelect.cpp:71
APInt.h
This file implements a class to represent arbitrary precision integral constant values and operations...
PerformADDCombineWithOperands
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
Definition:ARMISelLowering.cpp:13593
PerformADDCombine
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
Definition:ARMISelLowering.cpp:14078
PerformVSELECTCombine
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Definition:ARMISelLowering.cpp:13475
PerformMULCombine
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Definition:ARMISelLowering.cpp:14270
PerformFADDCombine
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
Definition:ARMISelLowering.cpp:17068
PerformANDCombine
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
Definition:ARMISelLowering.cpp:14468
PerformBUILD_VECTORCombine
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
Definition:ARMISelLowering.cpp:15326
MBB
MachineBasicBlock & MBB
Definition:ARMSLSHardening.cpp:71
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition:ARMSLSHardening.cpp:73
Results
Function Alias Analysis Results
Definition:AliasAnalysis.cpp:731
Alignment.h
Attributes.h
This file contains the simple types necessary to represent the attributes associated with functions a...
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
A
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Info
Analysis containing CSE Info
Definition:CSEInfo.cpp:27
Casting.h
CodeGen.h
CommandLine.h
Constants.h
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DataLayout.h
RetTy
return RetTy
Definition:DeadArgumentElimination.cpp:361
Idx
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
Definition:DeadArgumentElimination.cpp:353
DerivedTypes.h
DiagnosticInfo.h
Size
uint64_t Size
Definition:ELFObjHandler.cpp:81
X
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
FPEnv.h
This file contains the declarations of entities that describe floating point environment and related ...
GlobalValue.h
Argument.h
Function.h
Instruction.h
Module.h
Module.h This file contains the declarations for the Module class.
Type.h
Value.h
ISDOpcodes.h
Instructions.h
Options
static LVOptions Options
Definition:LVOptions.cpp:25
F
#define F(x, y, z)
Definition:MD5.cpp:55
I
#define I(x, y, z)
Definition:MD5.cpp:58
MachineFunction.h
getDebugLoc
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
Definition:MachineInstrBundle.cpp:109
MachineJumpTableInfo.h
MachineMemOperand.h
TRI
unsigned const TargetRegisterInfo * TRI
Definition:MachineSink.cpp:2029
MachineValueType.h
NVPTXAddrSpace.h
NVPTX address space definition.
NVPTXBaseInfo.h
shouldConvertToIndirectCall
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
Definition:NVPTXISelLowering.cpp:1402
sched4reg
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
PerformEXTRACTCombine
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Definition:NVPTXISelLowering.cpp:5067
isConstOne
static bool isConstOne(const SDValue &Operand)
Definition:NVPTXISelLowering.cpp:4926
FMAContractLevelOpt
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
IsPTXVectorType
static bool IsPTXVectorType(MVT VT)
Definition:NVPTXISelLowering.cpp:138
UsePrecDivF32
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
PerformStoreParamCombine
static SDValue PerformStoreParamCombine(SDNode *N)
Definition:NVPTXISelLowering.cpp:4607
ReplaceLoadVector
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
Definition:NVPTXISelLowering.cpp:5272
ReplaceBITCAST
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition:NVPTXISelLowering.cpp:5249
ReplaceCopyFromReg_128
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition:NVPTXISelLowering.cpp:5477
Is16bitsType
static bool Is16bitsType(MVT VT)
Definition:NVPTXISelLowering.cpp:167
combineMADConstOne
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
Definition:NVPTXISelLowering.cpp:4944
IsTypePassedAsArray
static bool IsTypePassedAsArray(const Type *Ty)
Definition:NVPTXISelLowering.cpp:1147
VectorizePTXValueVTs
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
Definition:NVPTXISelLowering.cpp:443
CanMergeParamLoadStoresStartingAt
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
Definition:NVPTXISelLowering.cpp:380
ReplaceINTRINSIC_W_CHAIN
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
Definition:NVPTXISelLowering.cpp:5368
PerformFADDCombineWithOperands
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
Definition:NVPTXISelLowering.cpp:4520
isConstZero
static bool isConstZero(const SDValue &Operand)
Definition:NVPTXISelLowering.cpp:4470
LowerVectorArith
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
Definition:NVPTXISelLowering.cpp:2628
ComputePTXValueVTs
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
Definition:NVPTXISelLowering.cpp:239
IsMulWideOperandDemotable
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
Definition:NVPTXISelLowering.cpp:4797
LowerUnalignedStoreParam
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
Definition:NVPTXISelLowering.cpp:1326
PerformREMCombine
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
Definition:NVPTXISelLowering.cpp:4757
getVectorLoweringShape
static std::optional< std::pair< unsigned int, EVT > > getVectorLoweringShape(EVT VectorVT)
Definition:NVPTXISelLowering.cpp:180
PerformMULCombineWithOperands
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
Definition:NVPTXISelLowering.cpp:4985
PerformStoreRetvalCombine
static SDValue PerformStoreRetvalCombine(SDNode *N)
Definition:NVPTXISelLowering.cpp:4613
AreMulWideOperandsDemotable
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
Definition:NVPTXISelLowering.cpp:4824
PerformStoreCombineHelper
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
Definition:NVPTXISelLowering.cpp:4596
adjustElementType
static bool adjustElementType(EVT &ElementType)
Definition:NVPTXISelLowering.cpp:1303
TryMULWIDECombine
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
Definition:NVPTXISelLowering.cpp:4860
combineMulSelectConstOne
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
Definition:NVPTXISelLowering.cpp:4955
matchMADConstOnePattern
static SDValue matchMADConstOnePattern(SDValue Add)
Definition:NVPTXISelLowering.cpp:4931
MaybeBitcast
static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue Value)
Definition:NVPTXISelLowering.cpp:491
UsePrecSqrtF32
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
ParamVectorizationFlags
ParamVectorizationFlags
Definition:NVPTXISelLowering.cpp:426
PVF_FIRST
@ PVF_FIRST
Definition:NVPTXISelLowering.cpp:428
PVF_SCALAR
@ PVF_SCALAR
Definition:NVPTXISelLowering.cpp:431
PVF_INNER
@ PVF_INNER
Definition:NVPTXISelLowering.cpp:427
PVF_LAST
@ PVF_LAST
Definition:NVPTXISelLowering.cpp:429
LowerUnalignedStoreRet
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
Definition:NVPTXISelLowering.cpp:3278
PromoteBinOpToF32
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
Definition:NVPTXISelLowering.cpp:2512
PromoteScalarIntegerPTX
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
Definition:NVPTXISelLowering.cpp:341
OperandSignedness
OperandSignedness
Definition:NVPTXISelLowering.cpp:4788
Unsigned
@ Unsigned
Definition:NVPTXISelLowering.cpp:4790
Signed
@ Signed
Definition:NVPTXISelLowering.cpp:4789
PerformSETCCCombine
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
Definition:NVPTXISelLowering.cpp:5040
LowerUnalignedLoadRetParam
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
Definition:NVPTXISelLowering.cpp:1357
GlobalUniqueCallSite
static std::atomic< unsigned > GlobalUniqueCallSite
Definition:NVPTXISelLowering.cpp:74
ForceMinByValParamAlign
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
UseApproxLog2F32
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
PerformSHLCombine
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
Definition:NVPTXISelLowering.cpp:5028
NVPTXISelLowering.h
NVPTXSubtarget.h
NVPTXTargetMachine.h
NVPTXTargetObjectFile.h
NVPTXUtilities.h
NVPTX.h
SmVersion
unsigned SmVersion
Definition:NVVMReflect.cpp:81
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
P
#define P(N)
if
if(PassOpts->AAPipeline)
Definition:PassBuilderBindings.cpp:64
Cond
const SmallVectorImpl< MachineOperand > & Cond
Definition:RISCVRedundantCopyElimination.cpp:75
CC
auto CC
Definition:RISCVRedundantCopyElimination.cpp:79
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
STLExtras.h
This file contains some templates that are useful if you are working with the STL at all.
SelectionDAGNodes.h
SelectionDAG.h
SmallVector.h
This file defines the SmallVector class.
Enabled
static bool Enabled
Definition:Statistic.cpp:46
StringRef.h
TargetCallingConv.h
Ptr
@ Ptr
Definition:TargetLibraryInfo.cpp:77
TargetLowering.h
This file describes how to lower LLVM code to machine code.
TargetOptions.h
ValueTypes.h
RHS
Value * RHS
Definition:X86PartialReduction.cpp:74
LHS
Value * LHS
Definition:X86PartialReduction.cpp:73
ArrayType
Definition:ItaniumDemangle.h:785
Node
Definition:ItaniumDemangle.h:163
T
llvm::APInt
Class for arbitrary precision integers.
Definition:APInt.h:78
llvm::APInt::isSignedIntN
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition:APInt.h:435
llvm::APInt::slt
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition:APInt.h:1130
llvm::APInt::isIntN
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition:APInt.h:432
llvm::APInt::sge
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition:APInt.h:1237
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition:Argument.h:31
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition:ArrayRef.h:41
llvm::ArrayRef::back
const T & back() const
back - Get the last element.
Definition:ArrayRef.h:177
llvm::ArrayRef::drop_back
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition:ArrayRef.h:213
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition:ArrayRef.h:163
llvm::AtomicRMWInst
an instruction that atomically reads a memory location, combines it with another value,...
Definition:Instructions.h:704
llvm::AtomicRMWInst::Add
@ Add
*p = old + v
Definition:Instructions.h:720
llvm::AtomicRMWInst::FAdd
@ FAdd
*p = old + v
Definition:Instructions.h:741
llvm::AtomicRMWInst::Min
@ Min
*p = old <signed v ? old : v
Definition:Instructions.h:734
llvm::AtomicRMWInst::Or
@ Or
*p = old | v
Definition:Instructions.h:728
llvm::AtomicRMWInst::Sub
@ Sub
*p = old - v
Definition:Instructions.h:722
llvm::AtomicRMWInst::And
@ And
*p = old & v
Definition:Instructions.h:724
llvm::AtomicRMWInst::Xor
@ Xor
*p = old ^ v
Definition:Instructions.h:730
llvm::AtomicRMWInst::Max
@ Max
*p = old >signed v ? old : v
Definition:Instructions.h:732
llvm::AtomicRMWInst::UMin
@ UMin
*p = old <unsigned v ? old : v
Definition:Instructions.h:738
llvm::AtomicRMWInst::UMax
@ UMax
*p = old >unsigned v ? old : v
Definition:Instructions.h:736
llvm::AtomicRMWInst::Xchg
@ Xchg
*p = v
Definition:Instructions.h:718
llvm::AtomicRMWInst::isFloatingPointOperation
bool isFloatingPointOperation() const
Definition:Instructions.h:882
llvm::AtomicRMWInst::getOperation
BinOp getOperation() const
Definition:Instructions.h:805
llvm::AtomicRMWInst::getValOperand
Value * getValOperand()
Definition:Instructions.h:874
llvm::AttributeList
Definition:Attributes.h:490
llvm::AttributeList::hasParamAttr
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Definition:Attributes.h:833
llvm::AttributeList::FirstArgIndex
@ FirstArgIndex
Definition:Attributes.h:495
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition:InstrTypes.h:1112
llvm::CallBase::getCalledFunction
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition:InstrTypes.h:1341
llvm::CallBase::getFunctionType
FunctionType * getFunctionType() const
Definition:InstrTypes.h:1199
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition:Instructions.h:1479
llvm::ConstantSDNode
Definition:SelectionDAGNodes.h:1684
llvm::ConstantSDNode::getZExtValue
uint64_t getZExtValue() const
Definition:SelectionDAGNodes.h:1701
llvm::ConstantSDNode::getAPIntValue
const APInt & getAPIntValue() const
Definition:SelectionDAGNodes.h:1700
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition:Constants.cpp:373
llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition:DWARFExpression.h:32
llvm::DWARFExpression::Operation::getNumOperands
uint64_t getNumOperands() const
Definition:DWARFExpression.h:90
llvm::DataLayout
A parsed version of the target data layout string and methods for querying it.
Definition:DataLayout.h:63
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition:DataLayout.h:457
llvm::DataLayout::getPrefTypeAlign
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition:DataLayout.cpp:847
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition:DiagnosticInfo.h:1097
llvm::Function
Definition:Function.h:63
llvm::Function::addFnAttr
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition:Function.cpp:641
llvm::Function::getReturnType
Type * getReturnType() const
Returns the type of the ret val.
Definition:Function.h:221
llvm::GlobalAddressSDNode
Definition:SelectionDAGNodes.h:1876
llvm::GlobalAddressSDNode::getAddressSpace
unsigned getAddressSpace() const
Definition:SelectionDAG.cpp:13070
llvm::GlobalAddressSDNode::getGlobal
const GlobalValue * getGlobal() const
Definition:SelectionDAGNodes.h:1890
llvm::GlobalObject
Definition:GlobalObject.h:27
llvm::Instruction
Definition:Instruction.h:68
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition:LLVMContext.h:67
llvm::LLVMContext::diagnose
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
Definition:LLVMContext.cpp:245
llvm::LoadSDNode
This class is used to represent ISD::LOAD nodes.
Definition:SelectionDAGNodes.h:2464
llvm::MCObjectFileInfo::getDataSection
MCSection * getDataSection() const
Definition:MCObjectFileInfo.h:272
llvm::MCSection
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition:MCSection.h:36
llvm::MCSymbol::getName
StringRef getName() const
getName - Get the symbol name.
Definition:MCSymbol.h:205
llvm::MVT
Machine Value Type.
Definition:MachineValueType.h:35
llvm::MVT::SimpleTy
SimpleValueType SimpleTy
Definition:MachineValueType.h:55
llvm::MVT::getVectorNumElements
unsigned getVectorNumElements() const
Definition:MachineValueType.h:294
llvm::MVT::isScalableVector
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
Definition:MachineValueType.h:113
llvm::MVT::integer_valuetypes
static auto integer_valuetypes()
Definition:MachineValueType.h:525
llvm::MVT::fixedlen_vector_valuetypes
static auto fixedlen_vector_valuetypes()
Definition:MachineValueType.h:542
llvm::MVT::getVectorVT
static MVT getVectorVT(MVT VT, unsigned NumElements)
Definition:MachineValueType.h:451
llvm::MVT::getIntegerVT
static MVT getIntegerVT(unsigned BitWidth)
Definition:MachineValueType.h:441
llvm::MVT::getScalarType
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
Definition:MachineValueType.h:259
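Illustrative sketch (not from this file) of deriving related MVTs with the factories above, as vector legalization code does:
#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;
// Sketch only: v4i32 -> v2i32; assumes a fixed, even element count.
static MVT halfWidthVector(MVT VT) {
  return MVT::getVectorVT(VT.getScalarType(), VT.getVectorNumElements() / 2);
}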
llvm::MachineBasicBlock
Definition:MachineBasicBlock.h:125
llvm::MachineFunction
Definition:MachineFunction.h:267
llvm::MachineFunction::getDenormalMode
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition:MachineFunction.cpp:324
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition:MachineFunction.h:743
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition:MachineFunction.h:704
llvm::MachineFunction::getJumpTableInfo
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
Definition:MachineFunction.h:756
llvm::MachineFunction::getTarget
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Definition:MachineFunction.h:729
llvm::MachineJumpTableInfo
Definition:MachineJumpTableInfo.h:46
llvm::MachineJumpTableInfo::EK_Inline
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
Definition:MachineJumpTableInfo.h:82
llvm::MachineJumpTableInfo::getJumpTables
const std::vector< MachineJumpTableEntry > & getJumpTables() const
Definition:MachineJumpTableInfo.h:110
llvm::MachineMemOperand::MODereferenceable
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
Definition:MachineMemOperand.h:144
llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition:MachineMemOperand.h:136
llvm::MachineMemOperand::MOInvariant
@ MOInvariant
The memory access always returns the same value (or traps).
Definition:MachineMemOperand.h:146
llvm::MachineMemOperand::MOStore
@ MOStore
The memory access writes data.
Definition:MachineMemOperand.h:138
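These flag bits are OR'd together when describing a memory intrinsic; a sketch of the composition used when filling in IntrinsicInfo (see getTgtMemIntrinsic):
#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;
// Sketch only: flags for an invariant, dereferenceable read.
static MachineMemOperand::Flags invariantLoadFlags() {
  return MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
         MachineMemOperand::MOInvariant;
}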
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition:MachineRegisterInfo.h:51
llvm::MachineRegisterInfo::createVirtualRegister
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
Definition:MachineRegisterInfo.cpp:156
llvm::MemIntrinsicSDNode
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
Definition:SelectionDAGNodes.h:1601
llvm::MemSDNode
This is an abstract virtual class for memory operations.
Definition:SelectionDAGNodes.h:1352
llvm::MemSDNode::getAlign
Align getAlign() const
Definition:SelectionDAGNodes.h:1370
llvm::MemSDNode::getMemOperand
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by this operation.
Definition:SelectionDAGNodes.h:1436
llvm::MemSDNode::getMemoryVT
EVT getMemoryVT() const
Return the type of the in-memory value.
Definition:SelectionDAGNodes.h:1432
llvm::NVPTXSubtarget
Definition:NVPTXSubtarget.h:30
llvm::NVPTXSubtarget::getMaxRequiredAlignment
unsigned getMaxRequiredAlignment() const
Definition:NVPTXSubtarget.h:130
llvm::NVPTXSubtarget::hasAtomMinMax64
bool hasAtomMinMax64() const
Definition:NVPTXSubtarget.h:78
llvm::NVPTXSubtarget::hasAtomAddF64
bool hasAtomAddF64() const
Definition:NVPTXSubtarget.h:75
llvm::NVPTXSubtarget::hasHWROT32
bool hasHWROT32() const
Definition:NVPTXSubtarget.h:82
llvm::NVPTXSubtarget::getTargetLowering
const NVPTXTargetLowering * getTargetLowering() const override
Definition:NVPTXSubtarget.h:69
llvm::NVPTXSubtarget::getMinCmpXchgSizeInBits
unsigned getMinCmpXchgSizeInBits() const
Definition:NVPTXSubtarget.h:132
llvm::NVPTXSubtarget::getPTXVersion
unsigned getPTXVersion() const
Definition:NVPTXSubtarget.h:134
llvm::NVPTXSubtarget::hasNativeBF16Support
bool hasNativeBF16Support(int Opcode) const
Definition:NVPTXSubtarget.cpp:73
llvm::NVPTXSubtarget::getRegisterInfo
const NVPTXRegisterInfo * getRegisterInfo() const override
Definition:NVPTXSubtarget.h:66
llvm::NVPTXSubtarget::getSmVersion
unsigned int getSmVersion() const
Definition:NVPTXSubtarget.h:106
llvm::NVPTXSubtarget::hasAtomBitwise64
bool hasAtomBitwise64() const
Definition:NVPTXSubtarget.h:77
llvm::NVPTXSubtarget::hasBF16Math
bool hasBF16Math() const
Definition:NVPTXSubtarget.h:84
llvm::NVPTXSubtarget::allowFP16Math
bool allowFP16Math() const
Definition:NVPTXSubtarget.cpp:69
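Lowering decisions in this file are typically gated on these subtarget queries; a sketch of the idiom (the 80/70 thresholds are placeholders, not requirements taken from this file):
// Sketch only; a hypothetical feature gate.
static bool hasHypotheticalFeature(const NVPTXSubtarget &STI) {
  return STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
}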
llvm::NVPTXTargetLowering
Definition:NVPTXISelLowering.h:99
llvm::NVPTXTargetLowering::getConstraintType
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
Definition:NVPTXISelLowering.cpp:4386
llvm::NVPTXTargetLowering::LowerOperation
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
Definition:NVPTXISelLowering.cpp:2649
llvm::NVPTXTargetLowering::nvTM
const NVPTXTargetMachine * nvTM
Definition:NVPTXISelLowering.h:193
llvm::NVPTXTargetLowering::LowerGlobalAddress
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
Definition:NVPTXISelLowering.cpp:1139
llvm::NVPTXTargetLowering::NVPTXTargetLowering
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
Definition:NVPTXISelLowering.cpp:499
llvm::NVPTXTargetLowering::useF32FTZ
bool useF32FTZ(const MachineFunction &MF) const
Definition:NVPTXISelLowering.cpp:133
llvm::NVPTXTargetLowering::LowerSTACKSAVE
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Definition:NVPTXISelLowering.cpp:2061
llvm::NVPTXTargetLowering::getFunctionArgumentAlignment
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
Definition:NVPTXISelLowering.cpp:1266
llvm::NVPTXTargetLowering::getSqrtEstimate
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
Definition:NVPTXISelLowering.cpp:1089
llvm::NVPTXTargetLowering::LowerReturn
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
Definition:NVPTXISelLowering.cpp:3304
llvm::NVPTXTargetLowering::LowerFormalArguments
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
Definition:NVPTXISelLowering.cpp:3079
llvm::NVPTXTargetLowering::LowerAsmOperandForConstraint
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
Definition:NVPTXISelLowering.cpp:3426
llvm::NVPTXTargetLowering::LowerSTACKRESTORE
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Definition:NVPTXISelLowering.cpp:2038
llvm::NVPTXTargetLowering::getParamName
std::string getParamName(const Function *F, int Idx) const
Definition:NVPTXISelLowering.cpp:4326
llvm::NVPTXTargetLowering::getPreferredVectorAction
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
Definition:NVPTXISelLowering.cpp:1082
llvm::NVPTXTargetLowering::getPrototype
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Definition:NVPTXISelLowering.cpp:1152
llvm::NVPTXTargetLowering::getFunctionParamOptimizedAlign
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
Definition:NVPTXISelLowering.cpp:4279
llvm::NVPTXTargetLowering::LowerDYNAMIC_STACKALLOC
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
Definition:NVPTXISelLowering.cpp:2007
llvm::NVPTXTargetLowering::getSetCCResultType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
Definition:NVPTXISelLowering.h:153
llvm::NVPTXTargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition:NVPTXISelLowering.cpp:4408
llvm::NVPTXTargetLowering::isLegalAddressingMode
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Definition:NVPTXISelLowering.cpp:4345
llvm::NVPTXTargetLowering::shouldExpandAtomicRMWInIR
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Definition:NVPTXISelLowering.cpp:5522
llvm::NVPTXTargetLowering::getFunctionByValParamAlign
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
Definition:NVPTXISelLowering.cpp:4300
llvm::NVPTXTargetLowering::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
Definition:NVPTXISelLowering.cpp:3439
llvm::NVPTXTargetLowering::getTargetNodeName
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
Definition:NVPTXISelLowering.cpp:1003
llvm::NVPTXTargetLowering::allowFMA
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
Definition:NVPTXISelLowering.cpp:4443
llvm::NVPTXTargetLowering::usePrecSqrtF32
bool usePrecSqrtF32() const
Definition:NVPTXISelLowering.cpp:123
llvm::NVPTXTargetLowering::getJumpTableEncoding
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
Definition:NVPTXISelLowering.cpp:2766
llvm::NVPTXTargetLowering::allowUnsafeFPMath
bool allowUnsafeFPMath(MachineFunction &MF) const
Definition:NVPTXISelLowering.cpp:4460
llvm::NVPTXTargetLowering::getDivF32Level
int getDivF32Level() const
Definition:NVPTXISelLowering.cpp:110
llvm::NVPTXTargetLowering::LowerCall
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
Definition:NVPTXISelLowering.cpp:1411
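LowerOperation is the entry point that routes custom-marked opcodes to the Lower* helpers listed above; an abridged sketch of its dispatch shape (the real function handles many more cases):
// Abridged sketch of the dispatch; not the complete case list.
SDValue NVPTXTargetLowering::LowerOperation(SDValue Op,
                                            SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::STACKSAVE:          return LowerSTACKSAVE(Op, DAG);
  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG);
  default:
    llvm_unreachable("Custom lowering not defined for operation");
  }
}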
llvm::NVPTXTargetMachine
NVPTXTargetMachine.
Definition:NVPTXTargetMachine.h:25
llvm::NVPTXTargetMachine::is64Bit
bool is64Bit() const
Definition:NVPTXTargetMachine.h:46
llvm::NVPTXTargetMachine::getStrPool
UniqueStringSaver & getStrPool() const
Definition:NVPTXTargetMachine.h:48
llvm::NVPTXTargetObjectFile::SelectSectionForGlobal
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
Definition:NVPTXISelLowering.cpp:5591
llvm::NVPTXTargetObjectFile::~NVPTXTargetObjectFile
~NVPTXTargetObjectFile() override
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
llvm::SDLoc
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Definition:SelectionDAGNodes.h:1182
llvm::SDNode
Represents one node in the SelectionDAG.
Definition:SelectionDAGNodes.h:496
llvm::SDNode::getAsAPIntVal
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
Definition:SelectionDAGNodes.h:1735
llvm::SDNode::getOpcode
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
Definition:SelectionDAGNodes.h:687
llvm::SDNode::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this node.
Definition:SelectionDAGNodes.h:739
llvm::SDNode::getIROrder
unsigned getIROrder() const
Return the node ordering.
Definition:SelectionDAGNodes.h:758
llvm::SDNode::getAsZExtVal
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
Definition:SelectionDAGNodes.h:1727
llvm::SDNode::getNumOperands
unsigned getNumOperands() const
Return the number of values used by this operation.
Definition:SelectionDAGNodes.h:973
llvm::SDNode::getVTList
SDVTList getVTList() const
Definition:SelectionDAGNodes.h:1020
llvm::SDNode::getOperand
const SDValue & getOperand(unsigned Num) const
Definition:SelectionDAGNodes.h:992
llvm::SDNode::getConstantOperandVal
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Definition:SelectionDAGNodes.h:1723
llvm::SDNode::getConstantOperandAPInt
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
Definition:SelectionDAGNodes.h:1731
llvm::SDNode::getValueType
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
Definition:SelectionDAGNodes.h:1062
llvm::SDNode::isUndef
bool isUndef() const
Return true if this node is an ISD::UNDEF node.
Definition:SelectionDAGNodes.h:694
llvm::SDNode::users
iterator_range< user_iterator > users()
Definition:SelectionDAGNodes.h:871
llvm::SDUse
Represents a use of a SDNode.
Definition:SelectionDAGNodes.h:283
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition:SelectionDAGNodes.h:145
llvm::SDValue::getNode
SDNode * getNode() const
get the SDNode which holds the desired result
Definition:SelectionDAGNodes.h:159
llvm::SDValue::getValue
SDValue getValue(unsigned R) const
Definition:SelectionDAGNodes.h:179
llvm::SDValue::getValueType
EVT getValueType() const
Return the ValueType of the referenced return value.
Definition:SelectionDAGNodes.h:1217
llvm::SDValue::getValueSizeInBits
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
Definition:SelectionDAGNodes.h:199
llvm::SDValue::getOperand
const SDValue & getOperand(unsigned i) const
Definition:SelectionDAGNodes.h:1225
llvm::SDValue::getSimpleValueType
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
Definition:SelectionDAGNodes.h:190
llvm::SDValue::getOpcode
unsigned getOpcode() const
Definition:SelectionDAGNodes.h:1213
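Illustrative sketch (not from this file) of the accessor-based pattern matching these methods enable, e.g. recognizing a right shift by a constant:
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
// Sketch only: match (srl X, C) and return the constant shift amount.
static bool matchSrlByConst(SDValue V, uint64_t &ShAmt) {
  if (V.getOpcode() != ISD::SRL)
    return false;
  SDNode *N = V.getNode();
  if (!isa<ConstantSDNode>(N->getOperand(1)))
    return false;
  ShAmt = N->getConstantOperandVal(1);
  return true;
}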
llvm::SectionKind
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition:SectionKind.h:22
llvm::SelectionDAG
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition:SelectionDAG.h:228
llvm::SelectionDAG::getExtLoad
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition:SelectionDAG.cpp:9287
llvm::SelectionDAG::getTargetGlobalAddress
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition:SelectionDAG.h:751
llvm::SelectionDAG::getRoot
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition:SelectionDAG.h:577
llvm::SelectionDAG::getAddrSpaceCast
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
Definition:SelectionDAG.cpp:2440
llvm::SelectionDAG::getCopyToReg
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
Definition:SelectionDAG.h:802
llvm::SelectionDAG::getMergeValues
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
Definition:SelectionDAG.cpp:9034
llvm::SelectionDAG::getVTList
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
Definition:SelectionDAG.cpp:10708
llvm::SelectionDAG::ExtractVectorElements
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
Definition:SelectionDAG.cpp:13053
llvm::SelectionDAG::getSetCC
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
Definition:SelectionDAG.h:1251
llvm::SelectionDAG::getSymbolFunctionGlobalAddress
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
Definition:SelectionDAG.cpp:12178
llvm::SelectionDAG::getConstantFP
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
Definition:SelectionDAG.cpp:1873
llvm::SelectionDAG::getLoad
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
Definition:SelectionDAG.cpp:9270
llvm::SelectionDAG::getTargetLoweringInfo
const TargetLowering & getTargetLoweringInfo() const
Definition:SelectionDAG.h:503
llvm::SelectionDAG::MorphNodeTo
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
Definition:SelectionDAG.cpp:11048
llvm::SelectionDAG::getCALLSEQ_END
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
Definition:SelectionDAG.h:1106
llvm::SelectionDAG::getBuildVector
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition:SelectionDAG.h:857
llvm::SelectionDAG::getBitcast
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
Definition:SelectionDAG.cpp:2433
llvm::SelectionDAG::getCopyFromReg
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
Definition:SelectionDAG.h:828
llvm::SelectionDAG::getSelect
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
Definition:SelectionDAG.h:1280
llvm::SelectionDAG::getDataLayout
const DataLayout & getDataLayout() const
Definition:SelectionDAG.h:497
llvm::SelectionDAG::getConstant
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
Definition:SelectionDAG.cpp:1666
llvm::SelectionDAG::getTruncStore
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Definition:SelectionDAG.cpp:9371
llvm::SelectionDAG::ReplaceAllUsesWith
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
Definition:SelectionDAG.cpp:11653
llvm::SelectionDAG::getStore
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
Definition:SelectionDAG.cpp:9320
llvm::SelectionDAG::getSignedConstant
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Definition:SelectionDAG.cpp:1794
llvm::SelectionDAG::getCALLSEQ_START
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
Definition:SelectionDAG.h:1094
llvm::SelectionDAG::RemoveDeadNode
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
Definition:SelectionDAG.cpp:1084
llvm::SelectionDAG::getBasicBlock
SDValue getBasicBlock(MachineBasicBlock *MBB)
Definition:SelectionDAG.cpp:2024
llvm::SelectionDAG::getAnyExtOrTrunc
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
Definition:SelectionDAG.cpp:1496
llvm::SelectionDAG::getSelectCC
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
Definition:SelectionDAG.h:1290
llvm::SelectionDAG::getIntPtrConstant
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
Definition:SelectionDAG.cpp:1806
llvm::SelectionDAG::getNode
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
Definition:SelectionDAG.cpp:10327
llvm::SelectionDAG::getFPExtendOrRound
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
Definition:SelectionDAG.cpp:1475
llvm::SelectionDAG::getTargetConstant
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition:SelectionDAG.h:701
llvm::SelectionDAG::getMachineFunction
MachineFunction & getMachineFunction() const
Definition:SelectionDAG.h:492
llvm::SelectionDAG::getZExtOrTrunc
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
Definition:SelectionDAG.cpp:1508
llvm::SelectionDAG::getContext
LLVMContext * getContext() const
Definition:SelectionDAG.h:510
llvm::SelectionDAG::setRoot
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition:SelectionDAG.h:586
llvm::SelectionDAG::getMemIntrinsicNode
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
Definition:SelectionDAG.cpp:9045
llvm::SelectionDAG::getTargetExternalSymbol
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
Definition:SelectionDAG.cpp:2069
llvm::SelectionDAG::getEntryNode
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition:SelectionDAG.h:580
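Most of the lowering code on this page reduces to the node-building pattern sketched here (illustrative only):
// Sketch only: build (add X, 1) with the same value type as X.
static SDValue buildAddOne(SelectionDAG &DAG, SDValue X, const SDLoc &dl) {
  EVT VT = X.getValueType();
  return DAG.getNode(ISD::ADD, dl, VT, X, DAG.getConstant(1, dl, VT));
}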
llvm::ShuffleVectorSDNode
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
Definition:SelectionDAGNodes.h:1625
llvm::ShuffleVectorSDNode::getMask
ArrayRef< int > getMask() const
Definition:SelectionDAGNodes.h:1638
llvm::SmallVectorBase::empty
bool empty() const
Definition:SmallVector.h:81
llvm::SmallVectorBase::size
size_t size() const
Definition:SmallVector.h:78
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition:SmallVector.h:573
llvm::SmallVectorImpl::assign
void assign(size_type NumElts, ValueParamT Elt)
Definition:SmallVector.h:704
llvm::SmallVectorImpl::append
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition:SmallVector.h:683
llvm::SmallVectorImpl::clear
void clear()
Definition:SmallVector.h:610
llvm::SmallVectorImpl::resize
void resize(size_type N)
Definition:SmallVector.h:638
llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition:SmallVector.h:413
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition:SmallVector.h:1196
llvm::StoreSDNode
This class is used to represent ISD::STORE nodes.
Definition:SelectionDAGNodes.h:2492
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition:StringRef.h:51
llvm::StringRef::size
constexpr size_t size() const
size - Get the string size.
Definition:StringRef.h:150
llvm::StringRef::data
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition:StringRef.h:144
llvm::StructType
Class to represent struct types.
Definition:DerivedTypes.h:218
llvm::TargetLoweringBase::setBooleanVectorContents
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
Definition:TargetLowering.h:2493
llvm::TargetLoweringBase::setOperationAction
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
Definition:TargetLowering.h:2562
llvm::TargetLoweringBase::setMaxDivRemBitWidthSupported
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
Definition:TargetLowering.h:2772
llvm::TargetLoweringBase::Enabled
@ Enabled
Definition:TargetLowering.h:576
llvm::TargetLoweringBase::Unspecified
@ Unspecified
Definition:TargetLowering.h:574
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition:TargetLowering.h:1677
llvm::TargetLoweringBase::LegalizeAction
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
Definition:TargetLowering.h:199
llvm::TargetLoweringBase::Custom
@ Custom
Definition:TargetLowering.h:204
llvm::TargetLoweringBase::Expand
@ Expand
Definition:TargetLowering.h:202
llvm::TargetLoweringBase::Promote
@ Promote
Definition:TargetLowering.h:201
llvm::TargetLoweringBase::MaxStoresPerMemcpyOptSize
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
Definition:TargetLowering.h:3718
llvm::TargetLoweringBase::getRegClassFor
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
Definition:TargetLowering.h:1042
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition:TargetLowering.h:364
llvm::TargetLoweringBase::setOperationPromotedToType
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
Definition:TargetLowering.h:2716
llvm::TargetLoweringBase::LegalizeTypeAction
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
Definition:TargetLowering.h:209
llvm::TargetLoweringBase::TypeSplitVector
@ TypeSplitVector
Definition:TargetLowering.h:216
llvm::TargetLoweringBase::addBypassSlowDiv
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
Definition:TargetLowering.h:2538
llvm::TargetLoweringBase::getNumRegisters
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
Definition:TargetLowering.h:1763
llvm::TargetLoweringBase::setMaxAtomicSizeInBitsSupported
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Definition:TargetLowering.h:2766
llvm::TargetLoweringBase::getPreferredVectorAction
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
Definition:TargetLowering.h:517
llvm::TargetLoweringBase::MaxStoresPerMemsetOptSize
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
Definition:TargetLowering.h:3703
llvm::TargetLoweringBase::setBooleanContents
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
Definition:TargetLowering.h:2479
llvm::TargetLoweringBase::MaxStoresPerMemmove
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
Definition:TargetLowering.h:3751
llvm::TargetLoweringBase::computeRegisterProperties
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
Definition:TargetLoweringBase.cpp:1275
llvm::TargetLoweringBase::MaxStoresPerMemmoveOptSize
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
Definition:TargetLowering.h:3753
llvm::TargetLoweringBase::addRegisterClass
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
Definition:TargetLowering.h:2545
llvm::TargetLoweringBase::getPointerTy
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
Definition:TargetLowering.h:371
llvm::TargetLoweringBase::MaxStoresPerMemset
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
Definition:TargetLowering.h:3701
llvm::TargetLoweringBase::setTruncStoreAction
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
Definition:TargetLowering.h:2625
llvm::TargetLoweringBase::ZeroOrNegativeOneBooleanContent
@ ZeroOrNegativeOneBooleanContent
Definition:TargetLowering.h:237
llvm::TargetLoweringBase::setMinCmpXchgSizeInBits
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
Definition:TargetLowering.h:2783
llvm::TargetLoweringBase::AddPromotedToType
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
Definition:TargetLowering.h:2710
llvm::TargetLoweringBase::AtomicExpansionKind
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
Definition:TargetLowering.h:253
llvm::TargetLoweringBase::AtomicExpansionKind::CmpXChg
@ CmpXChg
llvm::TargetLoweringBase::AtomicExpansionKind::None
@ None
llvm::TargetLoweringBase::setCondCodeAction
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
Definition:TargetLowering.h:2686
llvm::TargetLoweringBase::setTargetDAGCombine
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Definition:TargetLowering.h:2731
llvm::TargetLoweringBase::getMinStackArgumentAlignment
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
Definition:TargetLowering.h:2038
llvm::TargetLoweringBase::setLoadExtAction
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
Definition:TargetLowering.h:2579
llvm::TargetLoweringBase::ArgListTy
std::vector< ArgListEntry > ArgListTy
Definition:TargetLowering.h:329
llvm::TargetLoweringBase::allowsMemoryAccessForAlignment
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
Definition:TargetLoweringBase.cpp:1708
llvm::TargetLoweringBase::MaxStoresPerMemcpy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
Definition:TargetLowering.h:3716
llvm::TargetLoweringBase::setSchedulingPreference
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
Definition:TargetLowering.h:2498
llvm::TargetLoweringBase::setJumpIsExpensive
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
Definition:TargetLoweringBase.cpp:941
llvm::TargetLoweringBase::getOperationAction
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
Definition:TargetLowering.h:1270
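These protected setters are meant to be called from a target's TargetLowering constructor; a sketch of that configuration idiom (MyTargetLowering, MySubtarget, and GPR32RegClass are hypothetical names, not the NVPTX setup, which lives in NVPTXTargetLowering::NVPTXTargetLowering):
// Sketch only; all 'My*' names are hypothetical.
MyTargetLowering::MyTargetLowering(const TargetMachine &TM,
                                   const MySubtarget &STI)
    : TargetLowering(TM) {
  addRegisterClass(MVT::i32, &MyTarget::GPR32RegClass);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setBooleanContents(ZeroOrNegativeOneBooleanContent);
  computeRegisterProperties(STI.getRegisterInfo());
}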
llvm::TargetLowering
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
Definition:TargetLowering.h:3780
llvm::TargetLowering::ConstraintType
ConstraintType
Definition:TargetLowering.h:4950
llvm::TargetLowering::C_RegisterClass
@ C_RegisterClass
Definition:TargetLowering.h:4952
llvm::TargetLowering::expandUnalignedStore
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
Definition:TargetLowering.cpp:10277
llvm::TargetLowering::getConstraintType
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
Definition:TargetLowering.cpp:5525
llvm::TargetLowering::expandUnalignedLoad
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
Definition:TargetLowering.cpp:10128
llvm::TargetLowering::getRegForInlineAsmConstraint
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
Definition:TargetLowering.cpp:5669
llvm::TargetLowering::expandRoundInexactToOdd
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
Definition:TargetLowering.cpp:11593
llvm::TargetLowering::expandFP_ROUND
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
Definition:TargetLowering.cpp:11661
llvm::TargetLowering::LowerAsmOperandForConstraint
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Definition:TargetLowering.cpp:5587
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition:TargetMachine.h:77
llvm::TargetMachine::getOptLevel
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Definition:TargetMachine.h:257
llvm::TargetMachine::Options
TargetOptions Options
Definition:TargetMachine.h:118
llvm::TargetMachine::getSymbol
MCSymbol * getSymbol(const GlobalValue *GV) const
Definition:TargetMachine.cpp:283
llvm::TargetOptions::UnsafeFPMath
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
Definition:TargetOptions.h:176
llvm::TargetOptions::AllowFPOpFusion
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
Definition:TargetOptions.h:420
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition:TargetRegisterInfo.h:235
llvm::TypeSize
Definition:TypeSize.h:334
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition:Type.h:45
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition:Type.h:270
llvm::Type::isFloatTy
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition:Type.h:153
llvm::Type::isBFloatTy
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition:Type.h:145
llvm::Type::VoidTyID
@ VoidTyID
type with no size
Definition:Type.h:63
llvm::Type::isAggregateType
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition:Type.h:303
llvm::Type::isHalfTy
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition:Type.h:142
llvm::Type::isDoubleTy
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition:Type.h:156
llvm::Type::isFloatingPointTy
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition:Type.h:184
llvm::Type::isIntegerTy
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition:Type.h:237
llvm::Type::getTypeID
TypeID getTypeID() const
Return the type id for the type.
Definition:Type.h:136
llvm::Type::getPrimitiveSizeInBits
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
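Illustrative sketch (not from this file) of classifying a scalar FP type with the predicates above; getPrimitiveSizeInBits() would give the same widths directly:
#include "llvm/IR/Type.h"
using namespace llvm;
// Sketch only: bit width of a simple FP scalar, or 0 otherwise.
static unsigned bitsIfSimpleFP(const Type *Ty) {
  if (Ty->isHalfTy() || Ty->isBFloatTy())
    return 16;
  if (Ty->isFloatTy())
    return 32;
  if (Ty->isDoubleTy())
    return 64;
  return 0;
}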
llvm::UniqueStringSaver::save
StringRef save(const char *S)
Definition:StringSaver.h:52
llvm::User
Definition:User.h:44
llvm::Value
LLVM Value Representation.
Definition:Value.h:74
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition:Value.h:255
llvm::cl::Option::getNumOccurrences
int getNumOccurrences() const
Definition:CommandLine.h:399
llvm::cl::opt
Definition:CommandLine.h:1423
llvm::raw_string_ostream
A raw_ostream that writes to an std::string.
Definition:raw_ostream.h:661
uint32_t
uint64_t
unsigned
Analysis.h
ErrorHandling.h
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition:ErrorHandling.h:143
TargetMachine.h
llvm::AMDGPUISD::BFI
@ BFI
Definition:AMDGPUISelLowering.h:496
llvm::APIntOps::pow
APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition:APInt.cpp:3112
llvm::ARM_MB::LD
@ LD
Definition:ARMBaseInfo.h:72
llvm::ARM_MB::ST
@ ST
Definition:ARMBaseInfo.h:73
llvm::CallingConv::C
@ C
The default llvm calling convention, compatible with C.
Definition:CallingConv.h:34
llvm::FPOpFusion::Fast
@ Fast
Definition:TargetOptions.h:37
llvm::IRSimilarity::Legal
@ Legal
Definition:IRSimilarityIdentifier.h:76
llvm::ISD::NodeType
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition:ISDOpcodes.h:40
llvm::ISD::SETCC
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition:ISDOpcodes.h:780
llvm::ISD::STACKRESTORE
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
Definition:ISDOpcodes.h:1197
llvm::ISD::STACKSAVE
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition:ISDOpcodes.h:1193
llvm::ISD::STORE
@ STORE
Definition:ISDOpcodes.h:1103
llvm::ISD::SREM
@ SREM
Definition:ISDOpcodes.h:251
llvm::ISD::SMUL_LOHI
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition:ISDOpcodes.h:257
llvm::ISD::SSUBO_CARRY
@ SSUBO_CARRY
Definition:ISDOpcodes.h:321
llvm::ISD::UDIV
@ UDIV
Definition:ISDOpcodes.h:250
llvm::ISD::UINT_TO_FP
@ UINT_TO_FP
Definition:ISDOpcodes.h:842
llvm::ISD::UMIN
@ UMIN
Definition:ISDOpcodes.h:699
llvm::ISD::BSWAP
@ BSWAP
Byte Swap and Counting operators.
Definition:ISDOpcodes.h:744
llvm::ISD::ROTR
@ ROTR
Definition:ISDOpcodes.h:739
llvm::ISD::VAEND
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
Definition:ISDOpcodes.h:1226
llvm::ISD::ConstantFP
@ ConstantFP
Definition:ISDOpcodes.h:77
llvm::ISD::UADDO
@ UADDO
Definition:ISDOpcodes.h:331
llvm::ISD::FTRUNC
@ FTRUNC
Definition:ISDOpcodes.h:1013
llvm::ISD::SDIV
@ SDIV
Definition:ISDOpcodes.h:249
llvm::ISD::ADDC
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition:ISDOpcodes.h:276
llvm::ISD::FMAXNUM_IEEE
@ FMAXNUM_IEEE
Definition:ISDOpcodes.h:1045
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition:ISDOpcodes.h:246
llvm::ISD::LOAD
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition:ISDOpcodes.h:1102
llvm::ISD::ANY_EXTEND
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition:ISDOpcodes.h:814
llvm::ISD::FSUB
@ FSUB
Definition:ISDOpcodes.h:398
llvm::ISD::FMA
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition:ISDOpcodes.h:498
llvm::ISD::SUBC
@ SUBC
Definition:ISDOpcodes.h:277
llvm::ISD::FABS
@ FABS
Definition:ISDOpcodes.h:982
llvm::ISD::FNEARBYINT
@ FNEARBYINT
Definition:ISDOpcodes.h:1015
llvm::ISD::INTRINSIC_VOID
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition:ISDOpcodes.h:205
llvm::ISD::RETURNADDR
@ RETURNADDR
Definition:ISDOpcodes.h:101
llvm::ISD::GlobalAddress
@ GlobalAddress
Definition:ISDOpcodes.h:78
llvm::ISD::SINT_TO_FP
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition:ISDOpcodes.h:841
llvm::ISD::CONCAT_VECTORS
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition:ISDOpcodes.h:558
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition:ISDOpcodes.h:397
llvm::ISD::ABS
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition:ISDOpcodes.h:717
llvm::ISD::UDIVREM
@ UDIVREM
Definition:ISDOpcodes.h:263
llvm::ISD::SDIVREM
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition:ISDOpcodes.h:262
llvm::ISD::SRL
@ SRL
Definition:ISDOpcodes.h:737
llvm::ISD::FMAXIMUM
@ FMAXIMUM
Definition:ISDOpcodes.h:1051
llvm::ISD::BITCAST
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition:ISDOpcodes.h:954
llvm::ISD::BUILD_PAIR
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition:ISDOpcodes.h:236
llvm::ISD::FFLOOR
@ FFLOOR
Definition:ISDOpcodes.h:1018
llvm::ISD::SRA
@ SRA
Definition:ISDOpcodes.h:736
llvm::ISD::USUBO
@ USUBO
Definition:ISDOpcodes.h:335
llvm::ISD::SIGN_EXTEND
@ SIGN_EXTEND
Conversion operators.
Definition:ISDOpcodes.h:805
llvm::ISD::FLOG2
@ FLOG2
Definition:ISDOpcodes.h:1007
llvm::ISD::READSTEADYCOUNTER
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition:ISDOpcodes.h:1259
llvm::ISD::USHLSAT
@ USHLSAT
Definition:ISDOpcodes.h:367
llvm::ISD::UADDSAT
@ UADDSAT
Definition:ISDOpcodes.h:348
llvm::ISD::FMAXNUM
@ FMAXNUM
Definition:ISDOpcodes.h:1032
llvm::ISD::FRINT
@ FRINT
Definition:ISDOpcodes.h:1014
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition:ISDOpcodes.h:981
llvm::ISD::BR_CC
@ BR_CC
BR_CC - Conditional branch.
Definition:ISDOpcodes.h:1148
llvm::ISD::CTTZ
@ CTTZ
Definition:ISDOpcodes.h:745
llvm::ISD::SSUBO
@ SSUBO
Same for subtraction.
Definition:ISDOpcodes.h:334
llvm::ISD::FP_TO_UINT
@ FP_TO_UINT
Definition:ISDOpcodes.h:888
llvm::ISD::BRIND
@ BRIND
BRIND - Indirect branch.
Definition:ISDOpcodes.h:1123
llvm::ISD::BR_JT
@ BR_JT
BR_JT - Jumptable branch.
Definition:ISDOpcodes.h:1127
llvm::ISD::OR
@ OR
Definition:ISDOpcodes.h:710
llvm::ISD::SSUBSAT
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition:ISDOpcodes.h:356
llvm::ISD::UMULO
@ UMULO
Definition:ISDOpcodes.h:339
llvm::ISD::SRA_PARTS
@ SRA_PARTS
Definition:ISDOpcodes.h:795
llvm::ISD::SELECT
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition:ISDOpcodes.h:757
llvm::ISD::UMUL_LOHI
@ UMUL_LOHI
Definition:ISDOpcodes.h:258
llvm::ISD::UNDEF
@ UNDEF
UNDEF - An undefined node.
Definition:ISDOpcodes.h:218
llvm::ISD::VACOPY
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
Definition:ISDOpcodes.h:1222
llvm::ISD::FSHL
@ FSHL
Definition:ISDOpcodes.h:740
llvm::ISD::CopyFromReg
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition:ISDOpcodes.h:215
llvm::ISD::SADDO
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition:ISDOpcodes.h:330
llvm::ISD::FSHR
@ FSHR
Definition:ISDOpcodes.h:741
llvm::ISD::FROUND
@ FROUND
Definition:ISDOpcodes.h:1016
llvm::ISD::USUBSAT
@ USUBSAT
Definition:ISDOpcodes.h:357
llvm::ISD::MULHU
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition:ISDOpcodes.h:674
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition:ISDOpcodes.h:735
llvm::ISD::VECTOR_SHUFFLE
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition:ISDOpcodes.h:615
llvm::ISD::EXTRACT_SUBVECTOR
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition:ISDOpcodes.h:588
llvm::ISD::FMINNUM_IEEE
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition:ISDOpcodes.h:1044
llvm::ISD::FCOS
@ FCOS
Definition:ISDOpcodes.h:986
llvm::ISD::EXTRACT_VECTOR_ELT
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition:ISDOpcodes.h:550
llvm::ISD::CopyToReg
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition:ISDOpcodes.h:209
llvm::ISD::ZERO_EXTEND
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition:ISDOpcodes.h:811
llvm::ISD::DEBUGTRAP
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition:ISDOpcodes.h:1282
llvm::ISD::CTPOP
@ CTPOP
Definition:ISDOpcodes.h:747
llvm::ISD::SELECT_CC
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition:ISDOpcodes.h:772
llvm::ISD::FMUL
@ FMUL
Definition:ISDOpcodes.h:399
llvm::ISD::SRL_PARTS
@ SRL_PARTS
Definition:ISDOpcodes.h:796
llvm::ISD::FMINNUM
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition:ISDOpcodes.h:1031
llvm::ISD::SUB
@ SUB
Definition:ISDOpcodes.h:247
llvm::ISD::MULHS
@ MULHS
Definition:ISDOpcodes.h:675
llvm::ISD::SSHLSAT
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition:ISDOpcodes.h:366
llvm::ISD::SMULO
@ SMULO
Same for multiplication.
Definition:ISDOpcodes.h:338
llvm::ISD::PARITY
@ PARITY
Definition:ISDOpcodes.h:749
llvm::ISD::DYNAMIC_STACKALLOC
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition:ISDOpcodes.h:1112
llvm::ISD::SIGN_EXTEND_INREG
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition:ISDOpcodes.h:849
llvm::ISD::SMIN
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition:ISDOpcodes.h:697
llvm::ISD::FP_EXTEND
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition:ISDOpcodes.h:939
llvm::ISD::VSELECT
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition:ISDOpcodes.h:766
llvm::ISD::UADDO_CARRY
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition:ISDOpcodes.h:310
llvm::ISD::FROUNDEVEN
@ FROUNDEVEN
Definition:ISDOpcodes.h:1017
llvm::ISD::FDIV
@ FDIV
Definition:ISDOpcodes.h:400
llvm::ISD::BF16_TO_FP
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition:ISDOpcodes.h:973
llvm::ISD::FRAMEADDR
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition:ISDOpcodes.h:100
llvm::ISD::FREM
@ FREM
Definition:ISDOpcodes.h:401
llvm::ISD::FMINIMUM
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition:ISDOpcodes.h:1050
llvm::ISD::FP_TO_SINT
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition:ISDOpcodes.h:887
llvm::ISD::READCYCLECOUNTER
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition:ISDOpcodes.h:1253
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition:ISDOpcodes.h:709
llvm::ISD::TRAP
@ TRAP
TRAP - Trapping instruction.
Definition:ISDOpcodes.h:1279
llvm::ISD::INTRINSIC_WO_CHAIN
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition:ISDOpcodes.h:190
llvm::ISD::USUBO_CARRY
@ USUBO_CARRY
Definition:ISDOpcodes.h:311
llvm::ISD::SUBE
@ SUBE
Definition:ISDOpcodes.h:287
llvm::ISD::ADDE
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition:ISDOpcodes.h:286
llvm::ISD::UREM
@ UREM
Definition:ISDOpcodes.h:252
llvm::ISD::FREEZE
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
Definition:ISDOpcodes.h:223
llvm::ISD::INSERT_VECTOR_ELT
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition:ISDOpcodes.h:539
llvm::ISD::TokenFactor
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition:ISDOpcodes.h:52
llvm::ISD::FSIN
@ FSIN
Definition:ISDOpcodes.h:985
llvm::ISD::FCEIL
@ FCEIL
Definition:ISDOpcodes.h:1012
llvm::ISD::MUL
@ MUL
Definition:ISDOpcodes.h:248
llvm::ISD::FP_ROUND
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition:ISDOpcodes.h:920
llvm::ISD::CTLZ
@ CTLZ
Definition:ISDOpcodes.h:746
llvm::ISD::VASTART
@ VASTART
Definition:ISDOpcodes.h:1227
llvm::ISD::FSQRT
@ FSQRT
Definition:ISDOpcodes.h:983
llvm::ISD::TRUNCATE
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition:ISDOpcodes.h:817
llvm::ISD::VAARG
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
Definition:ISDOpcodes.h:1217
llvm::ISD::ROTL
@ ROTL
Definition:ISDOpcodes.h:738
llvm::ISD::SHL_PARTS
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition:ISDOpcodes.h:794
llvm::ISD::BITREVERSE
@ BITREVERSE
Definition:ISDOpcodes.h:748
llvm::ISD::FCOPYSIGN
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition:ISDOpcodes.h:508
llvm::ISD::SADDSAT
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition:ISDOpcodes.h:347
llvm::ISD::FEXP2
@ FEXP2
Definition:ISDOpcodes.h:1010
llvm::ISD::SMAX
@ SMAX
Definition:ISDOpcodes.h:698
llvm::ISD::UMAX
@ UMAX
Definition:ISDOpcodes.h:700
llvm::ISD::SADDO_CARRY
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition:ISDOpcodes.h:320
llvm::ISD::INTRINSIC_W_CHAIN
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition:ISDOpcodes.h:198
llvm::ISD::BUILD_VECTOR
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition:ISDOpcodes.h:530
llvm::ISD::allOperandsUndef
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
Definition:SelectionDAG.cpp:350
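For example, a DAG combine can use it to fold a node built entirely from undefs (a sketch, assuming N and DAG from a combine hook):
  // An all-undef BUILD_VECTOR can simply become UNDEF of the same type.
  if (N->getOpcode() == ISD::BUILD_VECTOR && ISD::allOperandsUndef(N))
    return DAG.getUNDEF(N->getValueType(0));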
llvm::ISD::SETOLT
@ SETOLT
Definition:ISDOpcodes.h:1616
llvm::ISD::SETNE
@ SETNE
Definition:ISDOpcodes.h:1635
llvm::ISD::SETUGT
@ SETUGT
Definition:ISDOpcodes.h:1622
llvm::ISD::SETOGT
@ SETOGT
Definition:ISDOpcodes.h:1614
llvm::ISD::SETULT
@ SETULT
Definition:ISDOpcodes.h:1624
llvm::ISD::SETGT
@ SETGT
Definition:ISDOpcodes.h:1631
llvm::ISD::SETLT
@ SETLT
Definition:ISDOpcodes.h:1633
llvm::ISD::SETGE
@ SETGE
Definition:ISDOpcodes.h:1632
llvm::ISD::SETUGE
@ SETUGE
Definition:ISDOpcodes.h:1623
llvm::ISD::SETLE
@ SETLE
Definition:ISDOpcodes.h:1634
llvm::ISD::SETULE
@ SETULE
Definition:ISDOpcodes.h:1625
llvm::ISD::SETEQ
@ SETEQ
Definition:ISDOpcodes.h:1630
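These condition codes parameterize SETCC nodes; a typical construction, assuming DAG, DL, and floating-point operands A and B from the surrounding code:
  // Ordered less-than comparison yielding an i1 predicate.
  SDValue Pred = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETOLT);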
llvm::ISD::NON_EXTLOAD
@ NON_EXTLOAD
Definition:ISDOpcodes.h:1590
llvm::ISD::SEXTLOAD
@ SEXTLOAD
Definition:ISDOpcodes.h:1590
llvm::ISD::ZEXTLOAD
@ ZEXTLOAD
Definition:ISDOpcodes.h:1590
llvm::ISD::EXTLOAD
@ EXTLOAD
Definition:ISDOpcodes.h:1590
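The four extension kinds select between plain, sign-, zero-, and any-extending loads; a sketch assuming DAG, DL, Chain, and Ptr from the surrounding lowering code:
  // Load an i8 from memory and sign-extend it to i32 in a single node.
  SDValue Val = DAG.getExtLoad(ISD::SEXTLOAD, DL, MVT::i32, Chain, Ptr,
                               MachinePointerInfo(), MVT::i8);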
llvm::LegacyLegalizeActions::Bitcast
@ Bitcast
Perform the operation on a different, but equivalently sized type.
Definition:LegacyLegalizerInfo.h:55
llvm::M68k::MemAddrModeKind::L
@ L
llvm::NVPTXAS::ADDRESS_SPACE_PARAM
@ ADDRESS_SPACE_PARAM
Definition:NVPTXAddrSpace.h:27
llvm::NVPTXAS::ADDRESS_SPACE_LOCAL
@ ADDRESS_SPACE_LOCAL
Definition:NVPTXAddrSpace.h:25
llvm::NVPTXAS::ADDRESS_SPACE_GENERIC
@ ADDRESS_SPACE_GENERIC
Definition:NVPTXAddrSpace.h:21
llvm::NVPTXISD::NodeType
NodeType
Definition:NVPTXISelLowering.h:23
llvm::NVPTXISD::Prototype
@ Prototype
Definition:NVPTXISelLowering.h:46
llvm::NVPTXISD::PrintConvergentCallUni
@ PrintConvergentCallUni
Definition:NVPTXISelLowering.h:38
llvm::NVPTXISD::LastCallArg
@ LastCallArg
Definition:NVPTXISelLowering.h:41
llvm::NVPTXISD::LOAD_PARAM
@ LOAD_PARAM
Definition:NVPTXISelLowering.h:29
llvm::NVPTXISD::CallArg
@ CallArg
Definition:NVPTXISelLowering.h:40
llvm::NVPTXISD::DeclareRetParam
@ DeclareRetParam
Definition:NVPTXISelLowering.h:32
llvm::NVPTXISD::STACKSAVE
@ STACKSAVE
Definition:NVPTXISelLowering.h:66
llvm::NVPTXISD::PRMT
@ PRMT
Definition:NVPTXISelLowering.h:62
llvm::NVPTXISD::StoreParamS32
@ StoreParamS32
Definition:NVPTXISelLowering.h:85
llvm::NVPTXISD::MoveParam
@ MoveParam
Definition:NVPTXISelLowering.h:47
llvm::NVPTXISD::CALL
@ CALL
Definition:NVPTXISelLowering.h:27
llvm::NVPTXISD::CallSymbol
@ CallSymbol
Definition:NVPTXISelLowering.h:45
llvm::NVPTXISD::BrxItem
@ BrxItem
Definition:NVPTXISelLowering.h:68
llvm::NVPTXISD::LoadParamV2
@ LoadParamV2
Definition:NVPTXISelLowering.h:80
llvm::NVPTXISD::PrintConvergentCall
@ PrintConvergentCall
Definition:NVPTXISelLowering.h:36
llvm::NVPTXISD::StoreV2
@ StoreV2
Definition:NVPTXISelLowering.h:77
llvm::NVPTXISD::RETURN
@ RETURN
Definition:NVPTXISelLowering.h:49
llvm::NVPTXISD::CallSeqBegin
@ CallSeqBegin
Definition:NVPTXISelLowering.h:50
llvm::NVPTXISD::FIRST_NUMBER
@ FIRST_NUMBER
Definition:NVPTXISelLowering.h:25
llvm::NVPTXISD::StoreRetval
@ StoreRetval
Definition:NVPTXISelLowering.h:87
llvm::NVPTXISD::StoreRetvalV2
@ StoreRetvalV2
Definition:NVPTXISelLowering.h:88
llvm::NVPTXISD::LDUV2
@ LDUV2
Definition:NVPTXISelLowering.h:75
llvm::NVPTXISD::MUL_WIDE_SIGNED
@ MUL_WIDE_SIGNED
Definition:NVPTXISelLowering.h:56
llvm::NVPTXISD::FSHL_CLAMP
@ FSHL_CLAMP
Definition:NVPTXISelLowering.h:54
llvm::NVPTXISD::SETP_F16X2
@ SETP_F16X2
Definition:NVPTXISelLowering.h:58
llvm::NVPTXISD::ProxyReg
@ ProxyReg
Definition:NVPTXISelLowering.h:53
llvm::NVPTXISD::StoreV4
@ StoreV4
Definition:NVPTXISelLowering.h:78
llvm::NVPTXISD::CallVal
@ CallVal
Definition:NVPTXISelLowering.h:44
llvm::NVPTXISD::BrxEnd
@ BrxEnd
Definition:NVPTXISelLowering.h:69
llvm::NVPTXISD::LoadParamV4
@ LoadParamV4
Definition:NVPTXISelLowering.h:81
llvm::NVPTXISD::Dummy
@ Dummy
Definition:NVPTXISelLowering.h:70
llvm::NVPTXISD::PrintCall
@ PrintCall
Definition:NVPTXISelLowering.h:35
llvm::NVPTXISD::CallPrototype
@ CallPrototype
Definition:NVPTXISelLowering.h:52
llvm::NVPTXISD::DeclareScalarRet
@ DeclareScalarRet
Definition:NVPTXISelLowering.h:34
llvm::NVPTXISD::DYNAMIC_STACKALLOC
@ DYNAMIC_STACKALLOC
Definition:NVPTXISelLowering.h:64
llvm::NVPTXISD::LoadV2
@ LoadV2
Definition:NVPTXISelLowering.h:73
llvm::NVPTXISD::CallArgEnd
@ CallArgEnd
Definition:NVPTXISelLowering.h:42
llvm::NVPTXISD::StoreRetvalV4
@ StoreRetvalV4
Definition:NVPTXISelLowering.h:89
llvm::NVPTXISD::BrxStart
@ BrxStart
Definition:NVPTXISelLowering.h:67
llvm::NVPTXISD::StoreParamV4
@ StoreParamV4
Definition:NVPTXISelLowering.h:84
llvm::NVPTXISD::CallArgBegin
@ CallArgBegin
Definition:NVPTXISelLowering.h:39
llvm::NVPTXISD::BFI
@ BFI
Definition:NVPTXISelLowering.h:61
llvm::NVPTXISD::StoreParamV2
@ StoreParamV2
Definition:NVPTXISelLowering.h:83
llvm::NVPTXISD::STACKRESTORE
@ STACKRESTORE
Definition:NVPTXISelLowering.h:65
llvm::NVPTXISD::Wrapper
@ Wrapper
Definition:NVPTXISelLowering.h:26
llvm::NVPTXISD::SETP_BF16X2
@ SETP_BF16X2
Definition:NVPTXISelLowering.h:59
llvm::NVPTXISD::DeclareParam
@ DeclareParam
Definition:NVPTXISelLowering.h:30
llvm::NVPTXISD::LDUV4
@ LDUV4
Definition:NVPTXISelLowering.h:76
llvm::NVPTXISD::CallVoid
@ CallVoid
Definition:NVPTXISelLowering.h:43
llvm::NVPTXISD::StoreParam
@ StoreParam
Definition:NVPTXISelLowering.h:82
llvm::NVPTXISD::StoreParamU32
@ StoreParamU32
Definition:NVPTXISelLowering.h:86
llvm::NVPTXISD::PrintCallUni
@ PrintCallUni
Definition:NVPTXISelLowering.h:37
llvm::NVPTXISD::DeclareRet
@ DeclareRet
Definition:NVPTXISelLowering.h:33
llvm::NVPTXISD::FSHR_CLAMP
@ FSHR_CLAMP
Definition:NVPTXISelLowering.h:55
llvm::NVPTXISD::DeclareScalarParam
@ DeclareScalarParam
Definition:NVPTXISelLowering.h:31
llvm::NVPTXISD::CallSeqEnd
@ CallSeqEnd
Definition:NVPTXISelLowering.h:51
llvm::NVPTXISD::BFE
@ BFE
Definition:NVPTXISelLowering.h:60
llvm::NVPTXISD::RET_GLUE
@ RET_GLUE
Definition:NVPTXISelLowering.h:28
llvm::NVPTXISD::FCOPYSIGN
@ FCOPYSIGN
Definition:NVPTXISelLowering.h:63
llvm::NVPTXISD::PseudoUseParam
@ PseudoUseParam
Definition:NVPTXISelLowering.h:48
llvm::NVPTXISD::MUL_WIDE_UNSIGNED
@ MUL_WIDE_UNSIGNED
Definition:NVPTXISelLowering.h:57
llvm::NVPTXISD::LoadV4
@ LoadV4
Definition:NVPTXISelLowering.h:74
llvm::NVPTXISD::LoadParam
@ LoadParam
Definition:NVPTXISelLowering.h:79
llvm::NVPTX::PTXLdStInstCode::V2
@ V2
Definition:NVPTX.h:163
llvm::NVPTX::PTXPrmtMode::NONE
@ NONE
Definition:NVPTX.h:219
llvm::NVPTX::Const
@ Const
Definition:NVPTX.h:147
llvm::RISCVFenceField::R
@ R
Definition:RISCVBaseInfo.h:373
llvm::SPII::Store
@ Store
Definition:SparcInstrInfo.h:33
llvm::SPII::Load
@ Load
Definition:SparcInstrInfo.h:32
llvm::Sched::RegPressure
@ RegPressure
Definition:TargetLowering.h:103
llvm::Sched::Source
@ Source
Definition:TargetLowering.h:102
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::cl::Hidden
@ Hidden
Definition:CommandLine.h:137
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition:CommandLine.h:443
llvm::dwarf::Index
Index
Definition:Dwarf.h:882
llvm::ms_demangle::QualifierMangleMode::Result
@ Result
llvm::tgtok::TrueVal
@ TrueVal
Definition:TGLexer.h:58
llvm::tgtok::FalseVal
@ FalseVal
Definition:TGLexer.h:59
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition:AddressRanges.h:18
llvm::Offset
@ Offset
Definition:DWP.cpp:480
llvm::isIndirectCall
static bool isIndirectCall(const MachineInstr &MI)
Definition:ARMBaseInstrInfo.h:666
llvm::shouldEmitPTXNoReturn
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
Definition:NVPTXUtilities.cpp:367
llvm::CGDataKind::Unknown
@ Unknown
llvm::all_of
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition:STLExtras.h:1739
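For instance, a minimal standalone example:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  llvm::SmallVector<int, 4> Vals = {2, 4, 6};
  // True: every element satisfies the predicate.
  bool AllEven = llvm::all_of(Vals, [](int V) { return V % 2 == 0; });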
llvm::size
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition:STLExtras.h:1697
llvm::Isv2x16VT
bool Isv2x16VT(EVT VT)
Definition:NVPTXUtilities.cpp:386
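Presumably the predicate for the packed 2x16-bit vector types that NVPTX keeps in one 32-bit register; a sketch under that assumption:
  // Assumption: true for packed 16-bit pairs such as v2f16.
  bool PackedPair = llvm::Isv2x16VT(EVT(MVT::v2f16));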
llvm::enumerate
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition:STLExtras.h:2448
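For example:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/raw_ostream.h"

  llvm::SmallVector<llvm::StringRef, 2> Names = {"lhs", "rhs"};
  // Each element exposes its position via index() and its payload via value().
  for (const auto &En : llvm::enumerate(Names))
    llvm::errs() << En.index() << ": " << En.value() << "\n";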
llvm::AlignStyle::Right
@ Right
llvm::AlignStyle::Left
@ Left
llvm::getAlign
MaybeAlign getAlign(const Function &F, unsigned Index)
Definition:NVPTXUtilities.cpp:323
llvm::PowerOf2Ceil
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition:MathExtras.h:395
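Worked values:
  #include "llvm/Support/MathExtras.h"
  #include <cassert>

  assert(llvm::PowerOf2Ceil(3) == 4); // rounds up to the next power of two
  assert(llvm::PowerOf2Ceil(8) == 8); // powers of two pass through unchanged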
llvm::transform
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition:STLExtras.h:1952
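For example:
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  llvm::SmallVector<int, 4> In = {1, 2, 3};
  llvm::SmallVector<int, 4> Out(In.size());
  // Writes 2, 4, 6 into Out.
  llvm::transform(In, Out.begin(), [](int V) { return V * 2; });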
llvm::isPowerOf2_32
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition:MathExtras.h:292
llvm::promoteScalarArgumentSize
unsigned promoteScalarArgumentSize(unsigned size)
Definition:NVPTXUtilities.h:75
llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition:Error.cpp:167
llvm::CodeGenOptLevel
CodeGenOptLevel
Code generation optimization level.
Definition:CodeGen.h:54
llvm::CodeGenOptLevel::None
@ None
-O0
llvm::CodeGenOptLevel::Default
@ Default
-O2, -Os
llvm::PackElem::Hi
@ Hi
llvm::PackElem::Lo
@ Lo
llvm::RecurKind::Mul
@ Mul
Product of integers.
llvm::RecurKind::Add
@ Add
Sum of integers.
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition:Alignment.h:155
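For example, padding a byte count up to an 8-byte boundary:
  #include "llvm/Support/Alignment.h"

  uint64_t Padded = llvm::alignTo(10, llvm::Align(8)); // == 16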
llvm::Op
DWARFExpression::Operation Op
Definition:DWARFExpression.cpp:22
llvm::ComputeValueVTs
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition:Analysis.cpp:79
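A sketch of the usual call pattern in argument and return lowering, assuming TLI, DL, and RetTy from the surrounding function:
  // Splits an aggregate IR type into its leaf EVTs plus their byte offsets.
  SmallVector<EVT, 16> ValueVTs;
  SmallVector<TypeSize, 16> Offsets;
  ComputeValueVTs(TLI, DL, RetTy, ValueVTs, /*MemVTs=*/nullptr, &Offsets);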
llvm::BitWidth
constexpr unsigned BitWidth
Definition:BitmaskEnum.h:217
llvm::isKernelFunction
bool isKernelFunction(const Function &F)
Definition:NVPTXUtilities.cpp:313
llvm::getMaybeBitcastedCallee
Function * getMaybeBitcastedCallee(const CallBase *CB)
Definition:NVPTXUtilities.cpp:363
llvm::commonAlignment
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition:Alignment.h:212
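Worked values:
  #include "llvm/Support/Alignment.h"
  #include <cassert>

  assert(llvm::commonAlignment(llvm::Align(16), 4) == llvm::Align(4));
  assert(llvm::commonAlignment(llvm::Align(16), 32) == llvm::Align(16));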
llvm::VFParamKind::Vector
@ Vector
std::swap
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition:BitVector.h:860
raw_ostream.h
N
#define N
llvm::APFloatBase::IEEEsingle
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition:APFloat.cpp:257
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition:Alignment.h:39
llvm::Align::value
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition:Alignment.h:85
llvm::DenormalMode::PreserveSign
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
Definition:FloatingPointMode.h:80
llvm::DenormalMode::Output
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Definition:FloatingPointMode.h:91
llvm::EVT
Extended Value Type.
Definition:ValueTypes.h:35
llvm::EVT::getStoreSize
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition:ValueTypes.h:390
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition:ValueTypes.h:137
llvm::EVT::getVectorVT
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition:ValueTypes.h:74
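For example, building the packed half-precision pair type, assuming Ctx is the current LLVMContext:
  EVT V2F16 = EVT::getVectorVT(Ctx, MVT::f16, 2); // a fixed-width 2 x f16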
llvm::EVT::changeTypeToInteger
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition:ValueTypes.h:121
llvm::EVT::isFloatingPoint
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition:ValueTypes.h:147
llvm::EVT::getVectorElementCount
ElementCount getVectorElementCount() const
Definition:ValueTypes.h:345
llvm::EVT::getSizeInBits
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition:ValueTypes.h:368
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition:ValueTypes.h:380
llvm::EVT::getSimpleVT
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition:ValueTypes.h:311
llvm::EVT::getFixedSizeInBits
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition:ValueTypes.h:376
llvm::EVT::isVector
bool isVector() const
Return true if this is a vector value type.
Definition:ValueTypes.h:168
llvm::EVT::getScalarType
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition:ValueTypes.h:318
llvm::EVT::bitsEq
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition:ValueTypes.h:251
llvm::EVT::getTypeForEVT
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition:ValueTypes.cpp:210
llvm::EVT::getVectorElementType
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition:ValueTypes.h:323
llvm::EVT::isScalarInteger
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition:ValueTypes.h:157
llvm::EVT::changeVectorElementType
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition:ValueTypes.h:102
llvm::EVT::getVectorNumElements
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition:ValueTypes.h:331
llvm::EVT::isInteger
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition:ValueTypes.h:152
llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition:MachineMemOperand.h:41
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition:Alignment.h:117
llvm::SDVTList
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
Definition:SelectionDAGNodes.h:79
llvm::TargetLoweringBase::AddrMode
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
Definition:TargetLowering.h:2816
llvm::TargetLoweringBase::AddrMode::BaseOffs
int64_t BaseOffs
Definition:TargetLowering.h:2818
llvm::TargetLoweringBase::AddrMode::BaseGV
GlobalValue * BaseGV
Definition:TargetLowering.h:2817
llvm::TargetLoweringBase::AddrMode::HasBaseReg
bool HasBaseReg
Definition:TargetLowering.h:2819
llvm::TargetLoweringBase::AddrMode::Scale
int64_t Scale
Definition:TargetLowering.h:2820
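A sketch of how a client fills this structure to query the addressing-mode hook, assuming DL, Ty, and a TargetLowering reference TLI from context:
  TargetLowering::AddrMode AM;
  AM.HasBaseReg = true; // [reg]
  AM.BaseOffs = 16;     // plus an immediate offset: [reg+16]
  bool Legal = TLI.isLegalAddressingMode(DL, AM, Ty, /*AddrSpace=*/0);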
llvm::TargetLoweringBase::IntrinsicInfo
Definition:TargetLowering.h:1202
llvm::TargetLowering::CallLoweringInfo
This structure contains all information that is necessary for lowering calls.
Definition:TargetLowering.h:4529
llvm::TargetLowering::CallLoweringInfo::Args
ArgListTy Args
Definition:TargetLowering.h:4553
llvm::TargetLowering::CallLoweringInfo::IsTailCall
bool IsTailCall
Definition:TargetLowering.h:4545
llvm::TargetLowering::CallLoweringInfo::Callee
SDValue Callee
Definition:TargetLowering.h:4552
llvm::TargetLowering::CallLoweringInfo::DL
SDLoc DL
Definition:TargetLowering.h:4555
llvm::TargetLowering::CallLoweringInfo::IsVarArg
bool IsVarArg
Definition:TargetLowering.h:4534
llvm::TargetLowering::CallLoweringInfo::Ins
SmallVector< ISD::InputArg, 32 > Ins
Definition:TargetLowering.h:4559
llvm::TargetLowering::CallLoweringInfo::NumFixedArgs
unsigned NumFixedArgs
Definition:TargetLowering.h:4550
llvm::TargetLowering::CallLoweringInfo::Chain
SDValue Chain
Definition:TargetLowering.h:4530
llvm::TargetLowering::CallLoweringInfo::getArgs
ArgListTy & getArgs()
Definition:TargetLowering.h:4708
llvm::TargetLowering::CallLoweringInfo::CB
const CallBase * CB
Definition:TargetLowering.h:4556
llvm::TargetLowering::CallLoweringInfo::Outs
SmallVector< ISD::OutputArg, 32 > Outs
Definition:TargetLowering.h:4557
llvm::TargetLowering::CallLoweringInfo::OutVals
SmallVector< SDValue, 32 > OutVals
Definition:TargetLowering.h:4558
llvm::TargetLowering::CallLoweringInfo::RetTy
Type * RetTy
Definition:TargetLowering.h:4531
llvm::TargetLowering::CallLoweringInfo::IsConvergent
bool IsConvergent
Definition:TargetLowering.h:4538
llvm::TargetLowering::CallLoweringInfo::DAG
SelectionDAG & DAG
Definition:TargetLowering.h:4554
llvm::TargetLowering::DAGCombinerInfo
Definition:TargetLowering.h:4228
llvm::TargetLowering::DAGCombinerInfo::isAfterLegalizeDAG
bool isAfterLegalizeDAG() const
Definition:TargetLowering.h:4241
llvm::TargetLowering::DAGCombinerInfo::DAG
SelectionDAG & DAG
Definition:TargetLowering.h:4234
llvm::TargetLowering::DAGCombinerInfo::CombineTo
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
Definition:DAGCombiner.cpp:921
llvm::cl::desc
Definition:CommandLine.h:409

Generated on Thu Jul 17 2025 15:11:18 for LLVM by doxygen 1.9.6