//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that NVPTX uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "nvptx-lower"

    cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
             " 1: do it  2: do it aggressively"),
    cl::desc("NVPTX Specific: 0 use div.approx, 1 use div.full, 2 use"
             " IEEE Compliant F32 div.rnd if available."),
    cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
/// does NOT use lg2.approx for log2, so this is disabled by default.
    "nvptx-approx-log2f32",
    cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
105"nvptx-force-min-byval-param-align",
cl::Hidden,
106cl::desc(
"NVPTX Specific: force 4-byte minimal alignment for byval" 107" params of device functions."),
  // If nvptx-prec-div32=N is used on the command-line, always honor it
  // Otherwise, use div.approx if fast math is enabled

  // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
  // Otherwise, use sqrt.approx if fast math is enabled

  case MVT::v8i8:   // <2 x i8x4>
  case MVT::v16i8:  // <4 x i8x4>
  case MVT::v8i16:  // <4 x i16x2>
  case MVT::v8f16:  // <4 x f16x2>
  case MVT::v8bf16: // <4 x bf16x2>

// When legalizing vector loads/stores, this function is called, which does two
// things:
// 1. Determines whether the vector is something we want to custom lower,
//    std::nullopt is returned if we do not want to custom lower it.
// 2. If we do want to handle it, returns two parameters:
//    - unsigned int NumElts - The number of elements in the final vector
//    - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, EVT>>
187// We only handle "native" vector sizes for now, e.g. <4 x double> is not 188// legal. We can (and should) split that into 2 stores of <2 x double> here 189// but I'm leaving that as a TODO for now. 207// This is a "native" vector type 208return std::pair(NumElts, EltVT);
209case MVT::v8i8:
// <2 x i8x4> 210case MVT::v8f16:
// <4 x f16x2> 211case MVT::v8bf16:
// <4 x bf16x2> 212case MVT::v8i16:
// <4 x i16x2> 213case MVT::v16i8:
// <4 x i8x4> 214// This can be upsized into a "native" vector type. 215// Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for 216// total load/store size, PTX syntax only supports v2/v4. Thus, we can't use 217// vectorized loads/stores with the actual element type for i8/i16 as that 218// would require v8/v16 variants that do not exist. 219// In order to load/store such vectors efficiently, here in Type 220// Legalization, we split the vector into word-sized chunks (v2x16/v4i8). 221// Later, we will lower to PTX as vectors of b32. 223// Number of elements to pack in one word. 226return std::pair(NumElts / NPerWord,
/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
/// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
/// into their primitive components.
/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
/// LowerCall, and LowerReturn.
    // Special case for i128 - decompose to (i64, i64)
      Offsets->push_back(StartingOffset + 0);
      Offsets->push_back(StartingOffset + 8);
  // Given a struct type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (StructType *STy = dyn_cast<StructType>(Ty)) {
    auto const *SL = DL.getStructLayout(STy);
    for (auto *EI : STy->elements()) {
                         StartingOffset + SL->getElementOffset(ElementNum));

  // Given an array type, recursively traverse the elements with custom
  // ComputePTXValueVTs.
  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    Type *EltTy = ATy->getElementType();
    for (int I : llvm::seq<int>(ATy->getNumElements()))

  for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
    // Split vectors into individual elements, except for v2f16, which
    // we will pass as a single scalar.
      // We require power-of-2 sized vectors because
      // TargetLoweringBase::getVectorTypeBreakdown() which is invoked in
      // ComputePTXValueVTs() cannot currently break down non-power-of-2 sized
      // vectors.

      // Vectors with an even number of f16 elements will be passed to
      // us as an array of v2f16/v2bf16 elements. We must match this so we
      // stay in sync with Ins/Outs.
        // v*i8 are formally lowered as v4i8
        NumElts = (NumElts + 3) / 4;
      } else if (EltVT.getSimpleVT() == MVT::i8 && NumElts == 2) {
        // v2i8 is promoted to v2i16
      for (unsigned j = 0; j != NumElts; ++j) {
        Offsets->push_back(Off);
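  // For illustration: with the rules above, an <8 x i8> piece is reported as
  // (8 + 3) / 4 = 2 parts of MVT::v4i8 rather than as 8 individual i8 parts.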
/// PromoteScalarIntegerPTX
/// Used to make sure the arguments/returns are suitable for passing
/// and promote them to a larger size if they're not.
///
/// The promoted type is placed in \p PromoteVT if the function returns true.
           "Promotion is not suitable for scalars of size larger than 64-bits");
      *PromotedVT = MVT::i1;
      *PromotedVT = MVT::i8;
      *PromotedVT = MVT::i16;
      *PromotedVT = MVT::i32;
      *PromotedVT = MVT::i64;
  return EVT(*PromotedVT) != VT;
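// For illustration: an i24 scalar is widened to MVT::i32 and the function
// reports a promotion by returning true, while an i32 already matches one of
// the sizes above, so no promotion is reported.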
// Check whether we can merge loads/stores of some of the pieces of a
// flattened function parameter or return value into a single vector
// load/store.
//
// The flattened parameter is represented as a list of EVTs and
// offsets, and the whole structure is aligned to ParamAlignment. This
// function determines whether we can load/store pieces of the
// parameter starting at index Idx using a single vectorized op of
// size AccessSize. If so, it returns the number of param pieces
// covered by the vector op. Otherwise, it returns 1.
  // Can't vectorize if param alignment is not sufficient.
  if (ParamAlignment < AccessSize)
  // Can't vectorize if offset is not aligned.
  if (Offsets[Idx] & (AccessSize - 1))
  // Element is too large to vectorize.
  if (EltSize >= AccessSize)

  unsigned NumElts = AccessSize / EltSize;
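  // For illustration: four f32 pieces at offsets 0, 4, 8 and 12 with
  // ParamAlignment >= 16 give NumElts = 16 / 4 = 4 for a 16-byte access;
  // if the element types match and the offsets are contiguous (checked
  // below), all four pieces can be covered by one vectorized op.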
  // Can't vectorize if AccessBytes is not a multiple of EltSize.
  if (AccessSize != EltSize * NumElts)

  // We don't have enough elements to vectorize.
  if (Idx + NumElts > ValueVTs.size())

  // PTX ISA can only deal with 2- and 4-element vector ops.
  if (NumElts != 4 && NumElts != 2)

  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
    // Types do not match.
    if (ValueVTs[j] != EltVT)
    // Elements are not contiguous.
    if (Offsets[j] - Offsets[j - 1] != EltSize)

  // OK. We can vectorize ValueVTs[i..i+NumElts)

// Flags for tracking per-element vectorization state of loads/stores
// of a flattened function parameter or return value.
  // Scalar is effectively a 1-element vector.

// Computes whether and how we can vectorize the loads/stores of a
// flattened function parameter or return value.
//
// The flattened parameter is represented as the list of ValueVTs and
// Offsets, and is aligned to ParamAlignment bytes. We return a vector
// of the same size as ValueVTs indicating how each piece should be
// loaded/stored (i.e. as a scalar, or as part of a vector
// load/store).
                          Align ParamAlignment,
bool IsVAArg =
false) {
446// Set vector size to match ValueVTs and mark all elements as 447// scalars by default. 454// Check what we can vectorize using 128/64/32-bit accesses. 455for (
intI = 0, E = ValueVTs.
size();
I != E; ++
I) {
456// Skip elements we've already processed. 458for (
unsigned AccessSize : {16, 8, 4, 2}) {
460I, AccessSize, ValueVTs, Offsets, ParamAlignment);
461// Mark vectorized elements. 466// Can't vectorize using this size, try next smaller size. 469assert(
I + 1 < E &&
"Not enough elements.");
475assert(
I + 3 < E &&
"Not enough elements.");
483// Break out of the inner loop because we've already succeeded 484// using largest possible AccessSize. 493if (
Value->getValueType(0) == VT)
// NVPTXTargetLowering Constructor.
  // always lower memset, memcpy, and memmove intrinsics to load/store
  // instructions, rather
  // than generating calls to memset, memcpy or memmove.
  // Jump is Expensive. Don't create extra control flow for 'and', 'or'
  // condition branches.
  // Wide divides are _very_ slow. Try to reduce the width of the divide if
  // possible.
  // By default, use the Source scheduling
  // Several FP16 instructions are available on sm_80 only.
      Op, VT, IsOpSupported ? Action : NoBF16Action);
555bool IsOpSupported =
false;
556// instructions are available on sm_90 only 582// Conversion to/from FP16/FP16x2 is always legal. 595// Conversion to/from BFP16/BFP16x2 is always legal. 606// Conversion to/from i16/i16x2 is always legal. 617// Custom conversions to/from v2i8. 620// Only logical ops can be done on v4i8 directly, others must be done 640// Operations not directly supported by NVPTX. 641for (
MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
642 MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16, MVT::v4i8,
643 MVT::i32, MVT::i64}) {
648// Some SIGN_EXTEND_INREG can be done using cvt instruction. 649// For others we will expand to a SHL/SRA pair. 668 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
682// We want to legalize constant related memmove and memcopy 686// Turn FP extload into load/fpextend 706// Turn FP truncstore into trunc + store. 707// FIXME: vector types should also be expanded 714// PTX does not support load / store predicate registers 730// expand extload of vector of integers. 735// This is legal in NVPTX 744// TRAP can be lowered to PTX trap 746// DEBUGTRAP can be lowered to PTX brkpt 749// Register custom handling for vector loads/stores 764// Custom handling for i8 intrinsics 767for (
constauto& Ty : {MVT::i16, MVT::i32, MVT::i64}) {
793// Other arithmetic and logic ops are unsupported. 815// PTX does not directly support SELP of i1, so promote to i32 first 818// PTX cannot multiply two i64s in a single instruction. 822// We have some custom DAG combine patterns for these nodes 827// setcc for f16x2 and bf16x2 needs special handling to prevent 828// legalizer's attempt to scalarize it due to v2i1 not being legal. 832// Promote fp16 arithmetic if fp16 hardware isn't available or the 833// user passed --nvptx-no-fp16-math. The flag is useful because, 834// although sm_53+ GPUs have some sort of FP16 support in 835// hardware, only sm_53 and sm_60 have full implementation. Others 836// only have token amount of hardware and are likely to run faster 837// by using fp32 units instead. 842// bf16 must be promoted to f32. 848// On SM80, we select add/mul/sub as fma to avoid promotion to float 850for (
constauto &VT : {MVT::bf16, MVT::v2bf16}) {
857// f16/f16x2 neg was introduced in PTX 60, SM_53. 858constbool IsFP16FP16x2NegAvailable = STI.
getSmVersion() >= 53 &&
861for (
constauto &VT : {MVT::f16, MVT::v2f16})
867// (would be) Library functions. 869// These map to conversion instructions for scalar FP types. 886for (
MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
892// sm_80 only has conversions between f32 and bf16. Custom lower all other 895for (
MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
913// 'Expand' implements FCOPYSIGN without calling an external library. 921// These map to corresponding instructions for f32/f64. f16 must be 922// promoted to f32. v2f16 is expanded to f16, which is then promoted 958bool SupportsF32MinMaxNaN =
968// Custom lowering for inline asm with 128-bit operands 974// - f16/f16x2 (sm_70+, PTX 7.0+) 975// - bf16/bf16x2 (sm_90+, PTX 7.8+) 976// When f16/bf16 types aren't supported, they are promoted/expanded to f32. 983// FLOG2 supports f32 only 984// f16/bf16 types aren't supported, but they are promoted/expanded to f32. 992// No FPOW or FREM in PTX. 994// Now deduce the information based on the above mentioned 1005#define MAKE_CASE(V) \ 1092bool Reciprocal)
const{
  // The sqrt and rsqrt refinement processes assume we always start out with an
  // approximation of the rsqrt. Therefore, if we're going to do any refinement
  // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
  // any refinement, we must return a regular sqrt.
  if (Reciprocal || ExtraSteps > 0) {
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
                                   : Intrinsic::nvvm_rsqrt_approx_f);
    else if (VT == MVT::f64)
      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
                                   : Intrinsic::nvvm_sqrt_approx_f);
      // There's no sqrt.approx.f64 instruction, so we emit
      // reciprocal(rsqrt(x)). This is faster than
      // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
      // sqrt(x).)
          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
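      // For illustration, the f64 path above corresponds roughly to the PTX
      // sequence:
      //   rsqrt.approx.f64    %t, %x;
      //   rcp.approx.ftz.f64  %r, %t;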
    std::optional<std::pair<unsigned, const APInt &>> VAInfo,
    const CallBase &CB, unsigned UniqueCallSite) const {
  assert(isABI && "Non-ABI compilation is not supported");
  std::string Prototype;
  O << "prototype_" << UniqueCallSite << " : .callprototype ";
1175if (
auto *ITy = dyn_cast<IntegerType>(retTy)) {
1176size = ITy->getBitWidth();
1179"Floating point type expected here");
1182// PTX ABI requires all scalar return values to be at least 32 1183// bits in size. fp16 normally uses .b16 as its storage type in 1184// PTX, so its size must be adjusted here, too. 1187 O <<
".param .b" <<
size <<
" _";
1188 }
elseif (isa<PointerType>(retTy)) {
1189 O <<
".param .b" << PtrVT.getSizeInBits() <<
" _";
1191 O <<
".param .align " << (retAlignment ? retAlignment->value() : 0)
1192 <<
" .b8 _[" <<
DL.getTypeAllocSize(retTy) <<
"]";
1202unsigned NumArgs = VAInfo ? VAInfo->first : Args.size();
1203for (
unsigned i = 0, OIdx = 0; i != NumArgs; ++i, ++OIdx) {
1204Type *Ty = Args[i].Ty;
1210if (!Outs[OIdx].Flags.isByVal()) {
1214 O <<
".param .align " << ParamAlign.
value() <<
" .b8 ";
1216 O <<
"[" <<
DL.getTypeAllocSize(Ty) <<
"]";
1217// update the index for Outs 1220if (
unsigned len = vtparts.
size())
1224// i8 types in IR will be i16 types in SDAG 1226 (
getValueType(
DL, Ty) == MVT::i8 && Outs[OIdx].VT == MVT::i16)) &&
1227"type mismatch between callee prototype and arguments");
1230if (isa<IntegerType>(Ty)) {
1231 sz = cast<IntegerType>(Ty)->getBitWidth();
1233 }
elseif (isa<PointerType>(Ty)) {
1234 sz = PtrVT.getSizeInBits();
1238 O <<
".param .b" << sz <<
" ";
1243// Indirect calls need strict ABI alignment so we disable optimizations by 1244// not providing a function to optimize. 1245Type *ETy = Args[i].IndirectType;
1246Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1247Align ParamByValAlign =
1250 O <<
".param .align " << ParamByValAlign.
value() <<
" .b8 ";
1252 O <<
"[" << Outs[OIdx].Flags.getByValSize() <<
"]";
1256 O << (first ?
"" :
",") <<
" .param .align " << VAInfo->second
1275// CallSite is zero, fallback to ABI type alignment 1276returnDL.getABITypeAlign(Ty);
1282// We don't have a direct function symbol, but that may be because of 1283// constant cast instructions in the call. 1285// With bitcast'd call targets, the instruction will be the call 1286if (
constauto *CI = dyn_cast<CallInst>(CB)) {
1287// Check if we have call alignment metadata 1289return StackAlign.value();
1294// Check for function alignment information if we found that the 1295// ultimate target is a Function 1299// Call is indirect, fall back to the ABI type alignment 1300returnDL.getABITypeAlign(Ty);
1304switch (ElementType.getSimpleVT().SimpleTy) {
1309 ElementType = MVT::i16;
1314 ElementType = MVT::i32;
1317 ElementType = MVT::i64;
1322// Use byte-store when the param address of the argument value is unaligned. 1323// This may happen when the return value is a field of a packed structure. 1325// This is called in LowerCall() when passing the param values. 1329unsigned ArgID,
constSDLoc &dl) {
1330// Bit logic only works on integer types 1336for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
    // Shift the byte to the last byte position
    // Trunc store only the last byte by using
    // The register type can be larger than b8.

// Use byte-load when the param address of the returned value is unaligned.
// This may happen when the returned value is a field of a packed structure.
  // Bit logic only works on integer types
  EVT MergedType = ElementType;
1365// Load each byte and construct the whole value. Initial value to 0 1367// LoadParamMemI8 loads into i16 register only 1369for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
1373// This will be selected to LoadParamMemI8 1387// Need to extend the i16 register to the whole width. 1389// Mask off the high bits. Leave only the lower 8bits. 1390// Do this because we are using loadparam.b8. 1396if (ElementType != MergedType)
1406if (
auto *CalleeFunc = dyn_cast<Function>(Func->getGlobal()))
1416"Support for variadic functions (unsized array parameter) introduced " 1417"in PTX ISA version 6.0 and requires target sm_30.");
1433assert(isABI &&
"Non-ABI compilation is not supported");
  // Variadic arguments.
  //
  // Normally, for each argument, we declare a param scalar or a param
  // byte array in the .param space, and store the argument value to that
  // param scalar or array starting at offset 0.
  //
  // In the case of the first variadic argument, we declare a vararg byte array
  // with size 0. The exact size of this array isn't known at this point, so
  // it'll be patched later. All the variadic arguments will be stored to this
  // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
  // initially set to 0, so it can be used for non-variadic arguments (which use
  // 0 offset) to simplify the code.
  //
  // After all vararg is processed, 'VAOffset' holds the size of the
  // vararg byte array.
  SDValue VADeclareParam;                 // vararg byte array
  unsigned FirstVAArg = CLI.NumFixedArgs; // position of the first variadic
  unsigned VAOffset = 0;                  // current offset in the param array

  unsigned ParamCount = 0;
1463// Args.size() and Outs.size() need not match. 1464// Outs.size() will be larger 1465// * if there is an aggregate argument with multiple fields (each field 1466// showing up separately in Outs) 1467// * if there is a vector argument with more than typical vector-length 1468// elements (generally if more than 4) where each vector element is 1469// individually present in Outs. 1470// So a different index should be used for indexing into Outs/OutVals. 1471// See similar issue in LowerFormalArguments. 1473// Declare the .params or .reg need to pass values 1475for (
unsigned i = 0, e = Args.size(); i != e; ++i, ++OIdx) {
1476EVT VT = Outs[OIdx].VT;
1477Type *Ty = Args[i].Ty;
1479bool IsByVal = Outs[OIdx].Flags.isByVal();
1484assert((!IsByVal || Args[i].IndirectType) &&
1485"byval arg must have indirect type");
1486Type *ETy = (IsByVal ? Args[i].IndirectType : Ty);
1491// The ByValAlign in the Outs[OIdx].Flags is always set at this point, 1492// so we don't need to worry whether it's naturally aligned or not. 1493// See TargetLowering::LowerCallTo(). 1494Align InitialAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
1498 VAOffset =
alignTo(VAOffset, ArgAlign);
1500 ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1,
DL);
1504 (IsByVal ? Outs[OIdx].Flags.getByValSize() :
DL.getTypeAllocSize(Ty));
1507bool NeedAlign;
// Does argument declaration specify alignment? 1510if (ParamCount == FirstVAArg) {
1516 DeclareParamVTs, DeclareParamOps);
1518 NeedAlign = PassAsArray;
1519 }
elseif (PassAsArray) {
1520// declare .param .align <align> .b8 .param<n>[<size>]; 1529// declare .param .b<size> .param<n>; 1531// PTX ABI requires integral types to be at least 32 bits in 1532// size. FP16 is loaded/stored using i16, so it's handled 1536SDValue DeclareScalarParamOps[] = {
1541 DeclareScalarParamOps);
1546// PTX Interoperability Guide 3.3(A): [Integer] Values shorter 1547// than 32-bits are sign extended or zero extended, depending on 1548// whether they are signed or unsigned types. This case applies 1549// only to scalar parameters and not to aggregate values. 1550bool ExtendIntegerParam =
1555for (
unsigned j = 0, je = VTs.
size(); j != je; ++j) {
1557int CurOffset = Offsets[j];
1566 EltVT =
EVT(PromotedVT);
1571 StVal = DAG.
getNode(Ext, dl, PromotedVT, StVal);
1580 }
elseif (ExtendIntegerParam) {
1581assert(VTs.
size() == 1 &&
"Scalar can't have multiple parts.");
1585 dl, MVT::i32, StVal);
1589// Use 16-bit registers for small stores as it's the 1590// smallest general purpose register size supported by NVPTX. 1594// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 1595// scalar store. In such cases, fall back to byte stores. 1596if (VectorInfo[j] ==
PVF_SCALAR && !IsVAArg && PartAlign.has_value() &&
1599assert(StoreOperands.
empty() &&
"Unfinished preceeding store.");
1601 DAG, Chain, IsByVal ? CurOffset + VAOffset : CurOffset, EltVT,
1602 StVal, InGlue, ParamCount, dl);
1604// LowerUnalignedStoreParam took care of inserting the necessary nodes 1605// into the SDAG, so just move on to the next element. 1613assert(StoreOperands.
empty() &&
"Unfinished preceding store.");
1616 DAG.
getConstant(IsVAArg ? FirstVAArg : ParamCount, dl, MVT::i32));
1619 IsByVal ? CurOffset + VAOffset : (IsVAArg ? VAOffset : CurOffset),
1623// Record the value to store. 1627unsigned NumElts = StoreOperands.
size() - 3;
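        // The three operands recorded before the values are the chain, the
        // parameter index, and the byte offset, hence the "- 3" above.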
1645// Adjust type of the store op if we've extended the scalar 1647EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1650Op, dl, DAG.
getVTList(MVT::Other, MVT::Glue), StoreOperands,
1656 StoreOperands.
clear();
1658// TODO: We may need to support vector types that can be passed 1659// as scalars in variadic arguments. 1660if (!IsByVal && IsVAArg) {
1662"Vectorization is expected to be disabled for variadics.");
1663 VAOffset +=
DL.getTypeAllocSize(
1670assert(StoreOperands.
empty() &&
"Unfinished parameter store.");
1671if (!IsByVal && VTs.
size() > 0)
1674if (IsByVal && IsVAArg)
1682if (Ins.size() > 0) {
1687// .param .align N .b8 retval0[<size-in-bytes>], or 1688// .param .b<size-in-bits> retval0 1689unsigned resultsz =
DL.getTypeAllocSizeInBits(
RetTy);
1700 retAlignment = getArgumentAlignment(CB,
RetTy, 0,
DL);
1701assert(retAlignment &&
"retAlignment is guaranteed to be set");
1704 Chain, DAG.
getConstant(retAlignment->value(), dl, MVT::i32),
1714// Set the size of the vararg param byte array if the callee is a variadic 1715// function and the variadic part is not empty. 1722 VADeclareParam->
getVTList(), DeclareParamOps);
1725// If the type of the callsite does not match that of the function, convert 1726// the callsite to an indirect call. 1729// Both indirect calls and libcalls have nullptr Func. In order to distinguish 1730// between them we must rely on the call site value which is valid for 1731// indirect calls but is always null for libcalls. 1734if (isa<ExternalSymbolSDNode>(Callee)) {
1737// Try to find the callee in the current module. 1739assert(CalleeFunc !=
nullptr &&
"Libcall callee must be set.");
1741// Set the "libcall callee" attribute to indicate that the function 1742// must always have a declaration. 1743 CalleeFunc->
addFnAttr(
"nvptx-libcall-callee",
"true");
    // This is indirect function call case : PTX requires a prototype of the
    //   proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
    // to be emitted, and the label has to be used as the last arg of call
    // The prototype is embedded in a string and put as the operand for a
    // CallPrototype SDNode which will print out to the value of the string.
        DL,
RetTy, Args, Outs, retAlignment,
1758 ? std::optional<std::pair<unsigned, const APInt &>>(std::make_pair(
1761 *CB, UniqueCallSite);
1771// Op to just print "call" 1774 Chain, DAG.
getConstant((Ins.size() == 0) ? 0 : 1, dl, MVT::i32), InGlue
1776// We model convergent calls as separate opcodes. 1781 Chain = DAG.
getNode(Opcode, dl, PrintCallVTs, PrintCallOps);
1784if (ConvertToIndirectCall) {
1785// Copy the function ptr to a ptx register and use the register to call the 1787EVT DestVT = Callee.getValueType();
1796// Ops to print out the function name 1798SDValue CallVoidOps[] = { Chain, Callee, InGlue };
1802// Ops to print out the param list 1804SDValue CallArgBeginOps[] = { Chain, InGlue };
1809for (
unsigned i = 0, e = std::min(CLI.
NumFixedArgs + 1, ParamCount); i != e;
1819 Chain = DAG.
getNode(opcode, dl, CallArgVTs, CallArgOps);
1823SDValue CallArgEndOps[] = { Chain,
1832 Chain, DAG.
getConstant(UniqueCallSite, dl, MVT::i32), InGlue};
  // An item of the vector is filled if the element does not need a ProxyReg
  // operation on it and should be added to InVals as is. ProxyRegOps and
  // ProxyRegTruncates contain empty/none items at the same index.

  // A temporary ProxyReg operation is inserted in `LowerUnalignedLoadRetParam()`
  // to use the values of `LoadParam`s and to be replaced later when
  // `CALLSEQ_END` is added.

  // Generate loads from param memory/moves from registers for result
  if (Ins.size() > 0) {
1853assert(VTs.
size() == Ins.size() &&
"Bad value decomposition");
1859int VecIdx = -1;
// Index of the first element of the vector. 1861// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 1862// 32-bits are sign extended or zero extended, depending on whether 1863// they are signed or unsigned types. 1864bool ExtendIntegerRetVal =
1865RetTy->isIntegerTy() &&
DL.getTypeAllocSizeInBits(
RetTy) < 32;
1867for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
1868bool needTruncate =
false;
1869EVT TheLoadType = VTs[i];
1870EVT EltType = Ins[i].VT;
1875 TheLoadType =
EVT(PromotedVT);
1876 EltType =
EVT(PromotedVT);
1880if (ExtendIntegerRetVal) {
1881 TheLoadType = MVT::i32;
1885if (VTs[i].isInteger())
1890// If we have a PVF_SCALAR entry, it may not be sufficiently aligned for a 1891// scalar load. In such cases, fall back to byte loads. 1893 EltAlign <
DL.getABITypeAlign(
1895assert(VecIdx == -1 && LoadVTs.
empty() &&
"Orphaned operand list.");
1897 DAG, Chain, Offsets[i], TheLoadType, InGlue, TempProxyRegOps, dl);
1899 ProxyRegTruncates.
push_back(std::optional<MVT>());
1906// Record index of the very first element of the vector. 1908assert(VecIdx == -1 && LoadVTs.
empty() &&
"Orphaned operand list.");
1915unsigned NumElts = LoadVTs.
size();
1935 DAG.
getConstant(Offsets[VecIdx], dl, MVT::i32), InGlue};
1937Op, dl, DAG.
getVTList(LoadVTs), LoadOperands, TheLoadType,
1941for (
unsigned j = 0; j < NumElts; ++j) {
1945 ProxyRegTruncates.
push_back(std::optional<MVT>(Ins[VecIdx + j].VT));
1947 ProxyRegTruncates.
push_back(std::optional<MVT>());
1951 InGlue = RetVal.
getValue(NumElts + 1);
1961 DAG.
getCALLSEQ_END(Chain, UniqueCallSite, UniqueCallSite + 1, InGlue, dl);
  // Append ProxyReg instructions to the chain to make sure that `callseq_end`
  // will not get lost. Otherwise, during libcalls expansion, the nodes can become
  // dangling.
  for (
unsigned i = 0; i < ProxyRegOps.
size(); ++i) {
1968if (i < RetElts.
size() && RetElts[i]) {
1975 DAG.
getVTList(ProxyRegOps[i].getSimpleValueType(), MVT::Other, MVT::Glue),
1976 { Chain, ProxyRegOps[i], InGlue }
1979 Chain = Ret.getValue(1);
1980 InGlue = Ret.getValue(2);
1982if (ProxyRegTruncates[i]) {
1989for (
SDValue &
T : TempProxyRegOps) {
1992 DAG.
getVTList(
T.getSimpleValueType(), MVT::Other, MVT::Glue),
1993 {Chain, T.getOperand(0), InGlue});
2001// set isTailCall to false for now, until we figure out how to express 2002// tail call optimization in PTX 2015"Support for dynamic alloca introduced in PTX ISA version 7.3 and " 2016"requires target sm_52.",
2026uint64_tAlign = cast<ConstantSDNode>(
Op.getOperand(2))->getZExtValue();
2029// The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32. 2034EVT RetTypes[] = {ValueSizeTy, MVT::Other};
2046"Support for stackrestore requires PTX ISA version >= 7.3 and target " 2050returnOp.getOperand(0);
2069"Support for stacksave requires PTX ISA version >= 7.3 and target >= " 2086// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack() 2087// (see LegalizeDAG.cpp). This is slow and uses local memory. 2088// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5 2094unsigned NumOperands = Node->getNumOperands();
2095for (
unsigned i = 0; i < NumOperands; ++i) {
2096SDValue SubOp = Node->getOperand(i);
2100for (
unsigned j = 0; j < NumSubElem; ++j) {
2109// Handle bitcasting from v2i8 without hitting the default promotion 2110// strategy which goes through stack memory. 2111EVT FromVT =
Op->getOperand(0)->getValueType(0);
2112if (FromVT != MVT::v2i8) {
2116// Pack vector elements into i16 and bitcast to final type 2128EVT ToVT =
Op->getValueType(0);
2132// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it 2133// would get lowered as two constant loads and vector-packing move. 2134// Instead we want just a constant move: 2135// mov.b32 %r2, 0x40003C00 2138EVT VT =
Op->getValueType(0);
2144 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
2145 isa<ConstantFPSDNode>(Operand);
2149// Lower non-const v4i8 vector as byte-wise constructed i32, which allows us 2150// to optimize calculation of constant parts. 2164auto PRMT__10 = GetPRMT(
Op->getOperand(0),
Op->getOperand(1),
true, 0x3340);
2165auto PRMT__32 = GetPRMT(
Op->getOperand(2),
Op->getOperand(3),
true, 0x3340);
2166auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32,
false, 0x5410);
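  // For illustration of the selectors above: in PTX's prmt.b32, each selector
  // nibble picks one byte out of the eight bytes formed by the two source
  // registers (0-3 from the first, 4-7 from the second). 0x3340 therefore
  // places the low byte of each of the two operands into the two low bytes of
  // the intermediate result, and 0x5410 merges bytes {0,1} of PRMT__10 with
  // bytes {0,1} of PRMT__32, packing all four i8 values into a single b32.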
2170// Get value or the Nth operand as an APInt(32). Undef values treated as 0. 2173EVT VT =
Op->getValueType(0);
2177if (VT == MVT::v2f16 || VT == MVT::v2bf16)
2178Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
2179elseif (VT == MVT::v2i16 || VT == MVT::v4i8)
2183// i8 values are carried around as i16, so we need to zero out upper bits, 2184// so they do not get in the way of combining individual byte values 2187returnValue.zext(32);
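  // For illustration: a constant v2f16 of {1.0, 2.0} packs (low element in the
  // low 16 bits) to 0x3C00 | 0x4000 << 16 = 0x40003C00, matching the mov.b32
  // example above.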
2191Value = GetOperand(
Op, 0) | GetOperand(
Op, 1).shl(16);
2192 }
elseif (VT == MVT::v4i8) {
2193Value = GetOperand(
Op, 0) | GetOperand(
Op, 1).shl(8) |
2194 GetOperand(
Op, 2).shl(16) | GetOperand(
Op, 3).shl(24);
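  // For illustration: a constant v4i8 of <1, 2, 3, 4> packs to
  // 0x01 | 0x02 << 8 | 0x03 << 16 | 0x04 << 24 = 0x04030201.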
2209if (VectorVT == MVT::v4i8) {
2220// Constant index will be matched by tablegen. 2221if (isa<ConstantSDNode>(
Index.getNode()))
2224// Extract individual elements and select one of them. 2242if (VectorVT != MVT::v4i8)
2246if (
Value->isUndef())
2265if (VectorVT != MVT::v4i8 ||
Op.getValueType() != MVT::v4i8)
2268// Lower shuffle to PRMT instruction. 2273if (
I.value() != -1)
// -1 is a placeholder for undef. 2274 Selector |= (
I.value() << (
I.index() * 4));
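  // For illustration: PRMT treats the two v4i8 operands as an 8-byte pool
  // (bytes 0-3 from the first operand, 4-7 from the second), so a shuffle
  // mask of <0, 4, 1, 5> becomes Selector = 0x5140 and interleaves the low
  // halves of the two operands.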
2282/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which 2283/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2285/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2292EVT VT =
Op.getValueType();
2301// For 32bit and sm35, we can use the funnel shift 'shf' instruction. 2302// {dHi, dLo} = {aHi, aLo} >> Amt 2304// dLo = shf.r.clamp aLo, aHi, Amt 2314// {dHi, dLo} = {aHi, aLo} >> Amt 2315// - if (Amt>=size) then 2316// dLo = aHi >> (Amt-size) 2317// dHi = aHi >> Amt (this is either all 0 or all 1) 2319// dLo = (aLo >>logic Amt) | (aHi << (size-Amt)) 2343/// LowerShiftLeftParts - Lower SHL_PARTS, which 2344/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift 2346/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift 2353EVT VT =
Op.getValueType();
    // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
    //   {dHi, dLo} = {aHi, aLo} << Amt
    //   dHi = shf.l.clamp aLo, aHi, Amt
    // {dHi, dLo} = {aHi, aLo} << Amt
    // - if (Amt>=size) then
    //      dLo = aLo << Amt (all 0)
    //      dHi = aLo << (Amt-size)
    // - else
    //      dLo = aLo << Amt
    //      dHi = (aHi << Amt) | (aLo >> (size-Amt))
/// If the types match, convert the generic copysign to the NVPTXISD version,
/// otherwise bail ensuring that mismatched cases are properly expanded.
  EVT VT =
Op.getValueType();
2421EVT VT =
Op.getValueType();
2424return LowerFROUND32(
Op, DAG);
2427return LowerFROUND64(
Op, DAG);
// This is the rounding method used in CUDA libdevice in C like code:
// float roundf(float A)
// {
//   float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
//   RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
//   return abs(A) < 0.5 ? (float)(int)A : RoundedA;
// }
  EVT VT =
Op.getValueType();
2447// RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f)) 2449constunsigned SignBitMask = 0x80000000;
2452constunsigned PointFiveInBits = 0x3F000000;
2461// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA; 2468// return abs(A) < 0.5 ? (float)(int)A : RoundedA; 2475// The implementation of round(double) is similar to that of round(float) in 2476// that they both separate the value range into three regions and use a method 2477// specific to the region to round the values. However, round(double) first 2478// calculates the round of the absolute value and then adds the sign back while 2479// round(float) directly rounds the value with sign. 2484EVT VT =
Op.getValueType();
2488// double RoundedA = (double) (int) (abs(A) + 0.5f); 2493// RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA; 2501// Add sign to rounded_A 2505// RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA; 2513EVT VT =
N->getValueType(0);
2537if (
Op.getValueType() == MVT::bf16) {
2541 DAG.
getNode(
Op.getOpcode(), Loc, MVT::f32,
Op.getOperand(0)),
2545// Everything else is considered legal. 2553if (
Op.getOperand(0).getValueType() == MVT::bf16) {
2556Op.getOpcode(), Loc,
Op.getValueType(),
2560// Everything else is considered legal. 2566EVT NarrowVT =
Op.getValueType();
2575// This combination was the first to support f32 -> bf16. 2582// Round-inexact-to-odd f64 to f32, then do the final rounding using 2583// the hardware f32 -> bf16 instruction. 2595// Everything else is considered legal. 2603EVT WideVT =
Op.getValueType();
2624// Everything else is considered legal. 2630if (
Op.getValueType() != MVT::v2i16)
2632EVT EltVT =
Op.getValueType().getVectorElementType();
2634for (
intI = 0, E =
Op.getValueType().getVectorNumElements();
I < E;
I++) {
2637 [&](
constSDUse &O) {
2638 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2639 O.get(), DAG.getIntPtrConstant(I, DL));
2650switch (
Op.getOpcode()) {
2660return LowerBUILD_VECTOR(
Op, DAG);
2662return LowerBITCAST(
Op, DAG);
2666return LowerEXTRACT_VECTOR_ELT(
Op, DAG);
2668return LowerINSERT_VECTOR_ELT(
Op, DAG);
2670return LowerVECTOR_SHUFFLE(
Op, DAG);
2672return LowerCONCAT_VECTORS(
Op, DAG);
2674return LowerSTORE(
Op, DAG);
2676return LowerLOAD(
Op, DAG);
2678return LowerShiftLeftParts(
Op, DAG);
2681return LowerShiftRightParts(
Op, DAG);
2683return LowerSelect(
Op, DAG);
2685return LowerFROUND(
Op, DAG);
2687return LowerFCOPYSIGN(
Op, DAG);
2690return LowerINT_TO_FP(
Op, DAG);
2693return LowerFP_TO_INT(
Op, DAG);
2695return LowerFP_ROUND(
Op, DAG);
2697return LowerFP_EXTEND(
Op, DAG);
2699return LowerBR_JT(
Op, DAG);
2701return LowerVAARG(
Op, DAG);
2703return LowerVASTART(
Op, DAG);
2723return LowerCopyToReg_128(
Op, DAG);
2727// Used only for bf16 on SM80, where we select fma for non-ftz operation 2728return PromoteBinOpIfF32FTZ(
Op, DAG);
2738constauto *JT = cast<JumpTableSDNode>(
Op.getOperand(1));
2741unsigned JId = JT->getIndex();
2747// Generate BrxStart node 2751// Generate BrxItem nodes 2757// Generate BrxEnd nodes 2765// This will prevent AsmPrinter from trying to print the jump tables itself. 2770// This function is almost a copy of SelectionDAG::expandVAArg(). 2771// The only diff is that this one produces loads from local address space. 2777constValue *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
2778EVT VT = Node->getValueType(0);
2780SDValue Tmp1 = Node->getOperand(0);
2781SDValue Tmp2 = Node->getOperand(1);
2782constMaybeAlign MA(Node->getConstantOperandVal(3));
2798// Increment the pointer, VAList, to the next vaarg 2803// Store the incremented VAList to the legalized pointer 2810// Load the actual argument out of the pointer VAList 2819// Store the address of unsized array <function>_vararg[] in the ap object. 2820SDValue Arg = getParamSymbol(DAG,
/* vararg */ -1, PtrVT);
2823constValue *SV = cast<SrcValueSDNode>(
Op.getOperand(2))->getValue();
2834assert(
Op.getValueType() == MVT::i1 &&
"Custom lowering enabled only for i1");
2845if (
Op.getValueType() == MVT::i1)
2846return LowerLOADi1(
Op, DAG);
2848// v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle 2849// unaligned loads and have to handle it here. 2850EVT VT =
Op.getValueType();
2853EVT MemVT =
Load->getMemoryVT();
2855 MemVT, *
Load->getMemOperand())) {
2867// v1 = ld i8* addr (-> i16) 2868// v = trunc i16 to i1 2875"Custom lowering for i1 load only");
2877LD->getBasePtr(),
LD->getPointerInfo(),
2878 MVT::i8,
LD->getAlign(),
2879LD->getMemOperand()->getFlags());
2881// The legalizer (the caller) is expecting two values from the legalized 2882// load, so we build a MergeValues node for it. See ExpandUnalignedLoad() 2883// in LegalizeDAG.cpp which also uses MergeValues. 2884SDValue Ops[] = { result,
LD->getChain() };
2893return LowerSTOREi1(
Op, DAG);
2895// v2f16 is legal, so we can't rely on legalizer to handle unaligned 2896// stores and have to handle it here. 2897if ((
Isv2x16VT(VT) || VT == MVT::v4i8) &&
2899 VT, *
Store->getMemOperand()))
2902// v2f16, v2bf16 and v2i16 don't need special handling. 2907return LowerSTOREVector(
Op, DAG);
2920if (!NumEltsAndEltVT)
2922auto [NumElts, EltVT] = NumEltsAndEltVT.value();
2929if (Alignment < PrefAlign) {
2930// This store is not sufficiently aligned, so bail out and let this vector 2931// store be scalarized. Note that we may still be able to emit smaller 2932// vector stores. For example, if we are storing a <4 x float> with an 2933// alignment of 8, this check will fail but the legalizer will try again 2934// with 2 x <2 x float>, which will succeed with an alignment of 8. 2938// Since StoreV2 is a target node, we cannot rely on DAG type legalization. 2939// Therefore, we must ensure the type is legal. For i1 and i8, we set the 2940// stored type to i16 and propagate the "real" type as the memory type. 2959// First is the chain 2962// Then the split values 2964"NumElts should not increase, only decrease or stay the same.");
2966// If the number of elements has decreased, getVectorLoweringShape has 2967// upsized the element types 2970// Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be 2973for (
unsigned i = 0; i < NumElts; ++i) {
2976 NumEltsPerSubVector);
2981for (
unsigned i = 0; i < NumElts; ++i) {
2990// Then any remaining arguments 2991 Ops.
append(
N->op_begin() + 2,
N->op_end());
2997// return DCI.CombineTo(N, NewSt, true); 3016ST->getAlign(),
ST->getMemOperand()->getFlags());
3022// Change the CopyToReg to take in two 64-bit operands instead of a 128-bit 3023// operand so that it can pass the legalization. 3025assert(
Op.getOperand(1).getValueType() == MVT::i128 &&
3026"Custom lowering for 128-bit CopyToReg only");
3040 NewOps[0] =
Op->getOperand(0);
// Chain 3041 NewOps[1] =
Op->getOperand(1);
// Dst Reg 3042 NewOps[2] =
Lo;
// Lower 64-bit 3043 NewOps[3] =
Hi;
// Higher 64-bit 3045 NewOps[4] =
Op->getOperand(3);
// Glue if exists 3050unsigned NVPTXTargetLowering::getNumRegisters(
3052 std::optional<MVT> RegisterVT = std::nullopt)
const{
3053if (VT == MVT::i128 && RegisterVT == MVT::i128)
3058bool NVPTXTargetLowering::splitValueIntoRegisterParts(
3060unsigned NumParts,
MVT PartVT, std::optional<CallingConv::ID>
CC)
const{
3068// This creates target external symbol for a function parameter. 3069// Name of the symbol is composed from its index and the function name. 3070// Negative index corresponds to special parameter (unsized array) used for 3071// passing variable arguments. 3092 std::vector<SDValue> OutChains;
3095assert(isABI &&
"Non-ABI compilation is not supported");
3099 std::vector<Type *> argTypes;
3100 std::vector<const Argument *> theArgs;
3102 theArgs.push_back(&
I);
3103 argTypes.push_back(
I.getType());
3105// argTypes.size() (or theArgs.size()) and Ins.size() need not match. 3106// Ins.size() will be larger 3107// * if there is an aggregate argument with multiple fields (each field 3108// showing up separately in Ins) 3109// * if there is a vector argument with more than typical vector-length 3110// elements (generally if more than 4) where each vector element is 3111// individually present in Ins. 3112// So a different index should be used for indexing into Ins. 3113// See similar issue in LowerCall. 3116for (
unsigned i = 0, e = theArgs.size(); i != e; ++i, ++InsIdx) {
3117Type *Ty = argTypes[i];
3119if (theArgs[i]->use_empty()) {
3128for (
unsigned parti = 0, parte = vtparts.
size(); parti != parte;
3133if (vtparts.
size() > 0)
3140for (
unsigned parti = 0; parti < NumRegs; ++parti) {
3152// In the following cases, assign a node order of "i+1" 3153// to newly created nodes. The SDNodes for params have to 3154// appear in the same order as their order of appearance 3155// in the original function. "i+1" holds that order. 3157bool aggregateIsPacked =
false;
3158if (
StructType *STy = dyn_cast<StructType>(Ty))
3159 aggregateIsPacked = STy->isPacked();
3171SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3172int VecIdx = -1;
// Index of the first element of the current vector. 3173for (
unsigned parti = 0, parte = VTs.
size(); parti != parte; ++parti) {
3175assert(VecIdx == -1 &&
"Orphaned vector.");
3179// That's the last element of this store op. 3181unsigned NumElts = parti - VecIdx + 1;
3182EVT EltVT = VTs[parti];
3183// i1 is loaded/stored as i8. 3185if (EltVT == MVT::i1)
3187elseif (
Isv2x16VT(EltVT) || EltVT == MVT::v4i8)
3188// getLoad needs a vector type, but it can't handle 3189// vectors which contain v2f16 or v2bf16 elements. So we must load 3190// using i32 here and then bitcast back. 3201if (aggregateIsPacked)
3214P.getNode()->setIROrder(i + 1);
3215for (
unsigned j = 0; j < NumElts; ++j) {
3218// We've loaded i1 as an i8 and now must truncate it back to i1 3219if (EltVT == MVT::i1)
3221// v2f16 was loaded as an i32. Now we must bitcast it back. 3222elseif (EltVT != LoadVT)
3225// If a promoted integer type is used, truncate down to the original 3231// Extend the element if necessary (e.g. an i8 is loaded 3232// into an i16 register) 3234 Ins[InsIdx].VT.getFixedSizeInBits() >
3238 Elt = DAG.
getNode(Extend, dl, Ins[InsIdx].VT, Elt);
3243// Reset vector tracking state. 3253// Param has ByVal attribute 3254// Return MoveParam(param symbol). 3255// Ideally, the param symbol can be returned directly, 3256// but when SDNode builder decides to use it in a CopyToReg(), 3257// machine instruction fails because TargetExternalSymbol 3258// (not lowered) is target dependent, and CopyToReg assumes 3259// the source is lowered. 3261assert(ObjectVT == Ins[InsIdx].VT &&
3262"Ins type did not match function type");
3263SDValue Arg = getParamSymbol(DAG, i, PtrVT);
3266 p.getNode()->setIROrder(i + 1);
3270if (!OutChains.empty())
// Use byte-store when the param address of the return value is unaligned.
// This may happen when the return value is a field of a packed structure.
  // Bit logic only works on integer types
  for (
unsigned i = 0, n = ElementType.getSizeInBits() / 8; i < n; i++) {
3287// Shift the byte to the last byte position 3292// Trunc store only the last byte by using 3294// The register type can be larger than b8. 3296 DAG.
getVTList(MVT::Other), StoreOperands,
3314assert(isABI &&
"Non-ABI compilation is not supported");
3323assert(VTs.
size() == OutVals.
size() &&
"Bad return value decomposition");
3325for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
3326SDValue PromotedOutVal = OutVals[i];
3329 VTs[i] =
EVT(PromotedVT);
3334 PromotedOutVal = DAG.
getNode(Ext, dl, PromotedVT, PromotedOutVal);
3336 PromotedOutVals.
push_back(PromotedOutVal);
3344// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than 3345// 32-bits are sign extended or zero extended, depending on whether 3346// they are signed or unsigned types. 3347bool ExtendIntegerRetVal =
3348RetTy->isIntegerTy() &&
DL.getTypeAllocSizeInBits(
RetTy) < 32;
3351for (
unsigned i = 0, e = VTs.
size(); i != e; ++i) {
3353SDValue RetVal = PromotedOutVals[i];
3355if (ExtendIntegerRetVal) {
3358 dl, MVT::i32, RetVal);
3360// Use 16-bit registers for small load-stores as it's the 3361// smallest general purpose register size supported by NVPTX. 3365// If we have a PVF_SCALAR entry, it may not even be sufficiently aligned 3366// for a scalar store. In such cases, fall back to byte stores. 3368EVT ElementType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3369Align ElementTypeAlign =
3370DL.getABITypeAlign(ElementType.getTypeForEVT(
RetTy->getContext()));
3373if (ElementAlign < ElementTypeAlign) {
3374assert(StoreOperands.
empty() &&
"Orphaned operand list.");
3378// The call to LowerUnalignedStoreRet inserted the necessary SDAG nodes 3379// into the graph, so just move on to the next element. 3384// New load/store. Record chain and offset operands. 3386assert(StoreOperands.
empty() &&
"Orphaned operand list.");
3391// Record the value to return. 3394// That's the last element of this store op. 3397unsigned NumElts = StoreOperands.
size() - 2;
3412// Adjust type of load/store op if we've extended the scalar 3414EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
3416Op, dl, DAG.
getVTList(MVT::Other), StoreOperands, TheStoreType,
3418// Cleanup vector state. 3419 StoreOperands.
clear();
3429if (Constraint.
size() > 1)
3434// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as 3436// because we need the information that is only available in the "Value" type 3438// pointer. In particular, the address space information. 3445case Intrinsic::nvvm_match_all_sync_i32p:
3446case Intrinsic::nvvm_match_all_sync_i64p:
3448// memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute 3449// in order to model data exchange with other threads, but perform no real 3451Info.memVT = MVT::i1;
3453// Our result depends on both our and other thread's arguments. 3456case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
3457case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
3458case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
3459case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
3460case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
3461case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
3462case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
3463case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
3464case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
3465case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
3466case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
3467case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
3468case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
3469case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
3470case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
3471case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
3472case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
3473case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
3474case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
3475case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
3476case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
3477case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
3478case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
3479case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
3481Info.memVT = MVT::v8f16;
3482Info.ptrVal =
I.getArgOperand(0);
3488case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
3489case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
3490case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
3491case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
3492case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
3493case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
3494case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
3495case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
3496case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
3497case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
3498case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
3499case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
3500case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
3501case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
3502case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
3503case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
3504case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
3505case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
3506case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
3507case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
3508case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
3509case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
3510case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
3511case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
3513Info.memVT = MVT::v2i32;
3514Info.ptrVal =
I.getArgOperand(0);
3521case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
3522case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
3523case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
3524case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
3525case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
3526case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
3527case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
3528case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
3529case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
3530case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
3531case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
3532case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
3533case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
3534case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
3535case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
3536case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
3538case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
3539case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
3540case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
3541case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
3542case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
3543case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
3544case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
3545case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
3546case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
3547case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
3548case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
3549case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
3550case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
3551case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
3552case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
3553case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
3554case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
3555case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16: {
3557Info.memVT = MVT::v4i32;
3558Info.ptrVal =
I.getArgOperand(0);
3565case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
3566case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
3567case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
3568case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
3569case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
3570case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
3571case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
3572case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
3574case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
3575case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
3576case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
3577case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
3578case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
3579case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
3580case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
3581case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
3582case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
3583case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
3584case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
3585case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
3586case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
3587case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
3588case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
3589case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
3590case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
3591case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
3592case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
3593case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
3594case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
3595case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16: {
3597Info.memVT = MVT::i32;
3598Info.ptrVal =
I.getArgOperand(0);
3605case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
3606case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
3607case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
3608case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
3609case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
3610case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
3611case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
3612case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
3613case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
3614case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
3615case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
3616case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
3618Info.memVT = MVT::v4f16;
3619Info.ptrVal =
I.getArgOperand(0);
3626case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
3627case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
3628case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
3629case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
3630case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
3631case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
3632case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
3633case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
3634case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
3635case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
3636case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
3637case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
3638case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
3639case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
3640case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
3641case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
3643Info.memVT = MVT::v8f32;
3644Info.ptrVal =
I.getArgOperand(0);
3651case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
3652case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
3653case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
3654case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
3656case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
3657case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
3658case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
3659case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
3661case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
3662case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
3663case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
3664case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
3665case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
3666case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
3667case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
3668case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
3669case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
3670case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
3671case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
3672case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
3674Info.memVT = MVT::v8i32;
3675Info.ptrVal =
I.getArgOperand(0);
3682case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
3683case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
3684case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
3685case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
3686case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
3687case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
3688case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
3689case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
3690case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
3691case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16: {
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
3701case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
3702case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
3703case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
3704case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
3706case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
3707case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
3708case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
3709case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
    Info.memVT = MVT::f64;
    Info.ptrVal = I.getArgOperand(0);
3719case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
3720case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
3721case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
3722case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
3732case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
3733case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
3734case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
3735case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
3736case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
3737case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
3738case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
3739case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
3740case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
3741case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
3742case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
3743case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
    Info.memVT = MVT::v4f16;
    Info.ptrVal = I.getArgOperand(0);
3753case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
3754case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
3755case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
3756case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
3757case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
3758case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
3759case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
3760case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
3761case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
3762case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
3763case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
3764case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
3765case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
3766case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
3767case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
3768case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
    Info.memVT = MVT::v8f32;
    Info.ptrVal = I.getArgOperand(0);
3778case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
3779case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
3780case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
3781case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
3782case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
3783case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
3784case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
3785case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
3786case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
3787case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
3788case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
3789case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
    Info.memVT = MVT::v8i32;
    Info.ptrVal = I.getArgOperand(0);
3799case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
3800case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
3801case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
3802case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
3803case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
3804case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
3805case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
3806case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride: {
    Info.memVT = MVT::v2i32;
    Info.ptrVal = I.getArgOperand(0);
3816case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
3817case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
3818case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
3819case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
    Info.memVT = MVT::v2f64;
    Info.ptrVal = I.getArgOperand(0);
3829case Intrinsic::nvvm_atomic_load_inc_32:
3830case Intrinsic::nvvm_atomic_load_dec_32:
3832case Intrinsic::nvvm_atomic_add_gen_f_cta:
3833case Intrinsic::nvvm_atomic_add_gen_f_sys:
3834case Intrinsic::nvvm_atomic_add_gen_i_cta:
3835case Intrinsic::nvvm_atomic_add_gen_i_sys:
3836case Intrinsic::nvvm_atomic_and_gen_i_cta:
3837case Intrinsic::nvvm_atomic_and_gen_i_sys:
3838case Intrinsic::nvvm_atomic_cas_gen_i_cta:
3839case Intrinsic::nvvm_atomic_cas_gen_i_sys:
3840case Intrinsic::nvvm_atomic_dec_gen_i_cta:
3841case Intrinsic::nvvm_atomic_dec_gen_i_sys:
3842case Intrinsic::nvvm_atomic_inc_gen_i_cta:
3843case Intrinsic::nvvm_atomic_inc_gen_i_sys:
3844case Intrinsic::nvvm_atomic_max_gen_i_cta:
3845case Intrinsic::nvvm_atomic_max_gen_i_sys:
3846case Intrinsic::nvvm_atomic_min_gen_i_cta:
3847case Intrinsic::nvvm_atomic_min_gen_i_sys:
3848case Intrinsic::nvvm_atomic_or_gen_i_cta:
3849case Intrinsic::nvvm_atomic_or_gen_i_sys:
3850case Intrinsic::nvvm_atomic_exch_gen_i_cta:
3851case Intrinsic::nvvm_atomic_exch_gen_i_sys:
3852case Intrinsic::nvvm_atomic_xor_gen_i_cta:
3853case Intrinsic::nvvm_atomic_xor_gen_i_sys: {
    auto &DL = I.getDataLayout();
    Info.ptrVal = I.getArgOperand(0);
3864case Intrinsic::nvvm_ldu_global_i:
3865case Intrinsic::nvvm_ldu_global_f:
3866case Intrinsic::nvvm_ldu_global_p: {
    auto &DL = I.getDataLayout();
    if (Intrinsic == Intrinsic::nvvm_ldu_global_i)
    else if (Intrinsic == Intrinsic::nvvm_ldu_global_p)
    Info.ptrVal = I.getArgOperand(0);
    Info.align =
        cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
3882case Intrinsic::nvvm_tex_1d_v4f32_s32:
3883case Intrinsic::nvvm_tex_1d_v4f32_f32:
3884case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
3885case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
3886case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
3887case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
3888case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
3889case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
3890case Intrinsic::nvvm_tex_2d_v4f32_s32:
3891case Intrinsic::nvvm_tex_2d_v4f32_f32:
3892case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
3893case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
3894case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
3895case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
3896case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
3897case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
3898case Intrinsic::nvvm_tex_3d_v4f32_s32:
3899case Intrinsic::nvvm_tex_3d_v4f32_f32:
3900case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
3901case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
3902case Intrinsic::nvvm_tex_cube_v4f32_f32:
3903case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
3904case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
3905case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
3906case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
3907case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
3908case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
3909case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
3910case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
3911case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
3912case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
3913case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
3914case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
3915case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
3916case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
3917case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
3918case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
3919case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
3920case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
3921case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
3922case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
3923case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
3924case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
3925case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
3926case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
3927case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
3928case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
3929case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
3930case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
3931case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
3932case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
3933case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
3934case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
3935case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
3936case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
3937case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
3938case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
3939case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
    Info.memVT = MVT::v4f32;
    Info.ptrVal = nullptr;
3948case Intrinsic::nvvm_tex_1d_v4s32_s32:
3949case Intrinsic::nvvm_tex_1d_v4s32_f32:
3950case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
3951case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
3952case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
3953case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
3954case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
3955case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
3956case Intrinsic::nvvm_tex_2d_v4s32_s32:
3957case Intrinsic::nvvm_tex_2d_v4s32_f32:
3958case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
3959case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
3960case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
3961case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
3962case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
3963case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
3964case Intrinsic::nvvm_tex_3d_v4s32_s32:
3965case Intrinsic::nvvm_tex_3d_v4s32_f32:
3966case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
3967case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
3968case Intrinsic::nvvm_tex_cube_v4s32_f32:
3969case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
3970case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
3971case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
3972case Intrinsic::nvvm_tex_cube_v4u32_f32:
3973case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
3974case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
3975case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
3976case Intrinsic::nvvm_tex_1d_v4u32_s32:
3977case Intrinsic::nvvm_tex_1d_v4u32_f32:
3978case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
3979case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
3980case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
3981case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
3982case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
3983case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
3984case Intrinsic::nvvm_tex_2d_v4u32_s32:
3985case Intrinsic::nvvm_tex_2d_v4u32_f32:
3986case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
3987case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
3988case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
3989case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
3990case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
3991case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
3992case Intrinsic::nvvm_tex_3d_v4u32_s32:
3993case Intrinsic::nvvm_tex_3d_v4u32_f32:
3994case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
3995case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
3996case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
3997case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
3998case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
3999case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4000case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4001case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4002case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4003case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4004case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4005case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4006case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4007case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4008case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4009case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4010case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4011case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4012case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4013case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4014case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4015case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4016case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4017case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4018case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4019case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4020case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4021case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4022case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4023case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4024case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4025case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4026case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4027case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4028case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4029case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4030case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4031case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4032case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4033case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4034case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4035case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4036case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4037case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4038case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4039case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4040case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4041case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4042case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4043case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4044case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4045case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4046case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4047case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4048case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4049case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4050case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4051case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4052case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4053case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4054case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4055case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4056case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4057case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4058case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4059case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4060case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4061case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4062case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4063case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
    Info.memVT = MVT::v4i32;
    Info.ptrVal = nullptr;
4072case Intrinsic::nvvm_suld_1d_i8_clamp:
4073case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4074case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4075case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4076case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4077case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4078case Intrinsic::nvvm_suld_2d_i8_clamp:
4079case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4080case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4081case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4082case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4083case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4084case Intrinsic::nvvm_suld_3d_i8_clamp:
4085case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4086case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4087case Intrinsic::nvvm_suld_1d_i8_trap:
4088case Intrinsic::nvvm_suld_1d_v2i8_trap:
4089case Intrinsic::nvvm_suld_1d_v4i8_trap:
4090case Intrinsic::nvvm_suld_1d_array_i8_trap:
4091case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4092case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4093case Intrinsic::nvvm_suld_2d_i8_trap:
4094case Intrinsic::nvvm_suld_2d_v2i8_trap:
4095case Intrinsic::nvvm_suld_2d_v4i8_trap:
4096case Intrinsic::nvvm_suld_2d_array_i8_trap:
4097case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4098case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4099case Intrinsic::nvvm_suld_3d_i8_trap:
4100case Intrinsic::nvvm_suld_3d_v2i8_trap:
4101case Intrinsic::nvvm_suld_3d_v4i8_trap:
4102case Intrinsic::nvvm_suld_1d_i8_zero:
4103case Intrinsic::nvvm_suld_1d_v2i8_zero:
4104case Intrinsic::nvvm_suld_1d_v4i8_zero:
4105case Intrinsic::nvvm_suld_1d_array_i8_zero:
4106case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4107case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4108case Intrinsic::nvvm_suld_2d_i8_zero:
4109case Intrinsic::nvvm_suld_2d_v2i8_zero:
4110case Intrinsic::nvvm_suld_2d_v4i8_zero:
4111case Intrinsic::nvvm_suld_2d_array_i8_zero:
4112case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4113case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4114case Intrinsic::nvvm_suld_3d_i8_zero:
4115case Intrinsic::nvvm_suld_3d_v2i8_zero:
4116case Intrinsic::nvvm_suld_3d_v4i8_zero:
    Info.memVT = MVT::i8;
    Info.ptrVal = nullptr;
4125case Intrinsic::nvvm_suld_1d_i16_clamp:
4126case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4127case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4128case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4129case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4130case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
4131case Intrinsic::nvvm_suld_2d_i16_clamp:
4132case Intrinsic::nvvm_suld_2d_v2i16_clamp:
4133case Intrinsic::nvvm_suld_2d_v4i16_clamp:
4134case Intrinsic::nvvm_suld_2d_array_i16_clamp:
4135case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
4136case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
4137case Intrinsic::nvvm_suld_3d_i16_clamp:
4138case Intrinsic::nvvm_suld_3d_v2i16_clamp:
4139case Intrinsic::nvvm_suld_3d_v4i16_clamp:
4140case Intrinsic::nvvm_suld_1d_i16_trap:
4141case Intrinsic::nvvm_suld_1d_v2i16_trap:
4142case Intrinsic::nvvm_suld_1d_v4i16_trap:
4143case Intrinsic::nvvm_suld_1d_array_i16_trap:
4144case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
4145case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
4146case Intrinsic::nvvm_suld_2d_i16_trap:
4147case Intrinsic::nvvm_suld_2d_v2i16_trap:
4148case Intrinsic::nvvm_suld_2d_v4i16_trap:
4149case Intrinsic::nvvm_suld_2d_array_i16_trap:
4150case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
4151case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
4152case Intrinsic::nvvm_suld_3d_i16_trap:
4153case Intrinsic::nvvm_suld_3d_v2i16_trap:
4154case Intrinsic::nvvm_suld_3d_v4i16_trap:
4155case Intrinsic::nvvm_suld_1d_i16_zero:
4156case Intrinsic::nvvm_suld_1d_v2i16_zero:
4157case Intrinsic::nvvm_suld_1d_v4i16_zero:
4158case Intrinsic::nvvm_suld_1d_array_i16_zero:
4159case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
4160case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
4161case Intrinsic::nvvm_suld_2d_i16_zero:
4162case Intrinsic::nvvm_suld_2d_v2i16_zero:
4163case Intrinsic::nvvm_suld_2d_v4i16_zero:
4164case Intrinsic::nvvm_suld_2d_array_i16_zero:
4165case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
4166case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
4167case Intrinsic::nvvm_suld_3d_i16_zero:
4168case Intrinsic::nvvm_suld_3d_v2i16_zero:
4169case Intrinsic::nvvm_suld_3d_v4i16_zero:
    Info.memVT = MVT::i16;
    Info.ptrVal = nullptr;
4178case Intrinsic::nvvm_suld_1d_i32_clamp:
4179case Intrinsic::nvvm_suld_1d_v2i32_clamp:
4180case Intrinsic::nvvm_suld_1d_v4i32_clamp:
4181case Intrinsic::nvvm_suld_1d_array_i32_clamp:
4182case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
4183case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
4184case Intrinsic::nvvm_suld_2d_i32_clamp:
4185case Intrinsic::nvvm_suld_2d_v2i32_clamp:
4186case Intrinsic::nvvm_suld_2d_v4i32_clamp:
4187case Intrinsic::nvvm_suld_2d_array_i32_clamp:
4188case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
4189case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
4190case Intrinsic::nvvm_suld_3d_i32_clamp:
4191case Intrinsic::nvvm_suld_3d_v2i32_clamp:
4192case Intrinsic::nvvm_suld_3d_v4i32_clamp:
4193case Intrinsic::nvvm_suld_1d_i32_trap:
4194case Intrinsic::nvvm_suld_1d_v2i32_trap:
4195case Intrinsic::nvvm_suld_1d_v4i32_trap:
4196case Intrinsic::nvvm_suld_1d_array_i32_trap:
4197case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
4198case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
4199case Intrinsic::nvvm_suld_2d_i32_trap:
4200case Intrinsic::nvvm_suld_2d_v2i32_trap:
4201case Intrinsic::nvvm_suld_2d_v4i32_trap:
4202case Intrinsic::nvvm_suld_2d_array_i32_trap:
4203case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
4204case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
4205case Intrinsic::nvvm_suld_3d_i32_trap:
4206case Intrinsic::nvvm_suld_3d_v2i32_trap:
4207case Intrinsic::nvvm_suld_3d_v4i32_trap:
4208case Intrinsic::nvvm_suld_1d_i32_zero:
4209case Intrinsic::nvvm_suld_1d_v2i32_zero:
4210case Intrinsic::nvvm_suld_1d_v4i32_zero:
4211case Intrinsic::nvvm_suld_1d_array_i32_zero:
4212case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
4213case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
4214case Intrinsic::nvvm_suld_2d_i32_zero:
4215case Intrinsic::nvvm_suld_2d_v2i32_zero:
4216case Intrinsic::nvvm_suld_2d_v4i32_zero:
4217case Intrinsic::nvvm_suld_2d_array_i32_zero:
4218case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
4219case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
4220case Intrinsic::nvvm_suld_3d_i32_zero:
4221case Intrinsic::nvvm_suld_3d_v2i32_zero:
4222case Intrinsic::nvvm_suld_3d_v4i32_zero:
    Info.memVT = MVT::i32;
    Info.ptrVal = nullptr;
4231case Intrinsic::nvvm_suld_1d_i64_clamp:
4232case Intrinsic::nvvm_suld_1d_v2i64_clamp:
4233case Intrinsic::nvvm_suld_1d_array_i64_clamp:
4234case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
4235case Intrinsic::nvvm_suld_2d_i64_clamp:
4236case Intrinsic::nvvm_suld_2d_v2i64_clamp:
4237case Intrinsic::nvvm_suld_2d_array_i64_clamp:
4238case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
4239case Intrinsic::nvvm_suld_3d_i64_clamp:
4240case Intrinsic::nvvm_suld_3d_v2i64_clamp:
4241case Intrinsic::nvvm_suld_1d_i64_trap:
4242case Intrinsic::nvvm_suld_1d_v2i64_trap:
4243case Intrinsic::nvvm_suld_1d_array_i64_trap:
4244case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
4245case Intrinsic::nvvm_suld_2d_i64_trap:
4246case Intrinsic::nvvm_suld_2d_v2i64_trap:
4247case Intrinsic::nvvm_suld_2d_array_i64_trap:
4248case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
4249case Intrinsic::nvvm_suld_3d_i64_trap:
4250case Intrinsic::nvvm_suld_3d_v2i64_trap:
4251case Intrinsic::nvvm_suld_1d_i64_zero:
4252case Intrinsic::nvvm_suld_1d_v2i64_zero:
4253case Intrinsic::nvvm_suld_1d_array_i64_zero:
4254case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
4255case Intrinsic::nvvm_suld_2d_i64_zero:
4256case Intrinsic::nvvm_suld_2d_v2i64_zero:
4257case Intrinsic::nvvm_suld_2d_array_i64_zero:
4258case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
4259case Intrinsic::nvvm_suld_3d_i64_zero:
4260case Intrinsic::nvvm_suld_3d_v2i64_zero:
    Info.memVT = MVT::i64;
    Info.ptrVal = nullptr;
/// getFunctionParamOptimizedAlign - since function arguments are passed via
/// .param space, we may want to increase their alignment in a way that
/// ensures that we can effectively vectorize their loads & stores. We can
/// increase alignment only if the function has internal or private linkage,
/// as for other linkage types callers may already rely on the default
/// alignment. To allow using 128-bit vectorized loads/stores, this function
/// ensures that alignment is 16 or greater.
  // Capping the alignment to 128 bytes as that is the maximum alignment
  const Align ABITypeAlign = std::min(Align(128), DL.getABITypeAlign(ArgTy));
  // If a function has linkage different from internal or private, we
  // must use default ABI alignment as external users rely on it. Same
  // for a function that may be called from a function pointer.
  if (!F || !F->hasLocalLinkage() ||
      F->hasAddressTaken(/*Users=*/nullptr,
                         /*IgnoreCallbackUses=*/false,
                         /*IgnoreAssumeLikeCalls=*/true,
                         /*IgnoreLLVMUsed=*/true))
    return ABITypeAlign;

  return std::max(Align(16), ABITypeAlign);
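  // For example (illustrative, not taken from the surrounding code): a device
  // function parameter of type <4 x float> that is known to be 16-byte
  // aligned in .param space can be read with a single 128-bit vectorized
  // access (e.g. ld.param.v4.f32), whereas with only the 4-byte ABI alignment
  // the reads would have to be split into narrower accesses.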
/// Helper for computing alignment of a device function byval parameter.
  Align ArgAlign = InitialAlign;
  // Try to increase alignment to enhance vectorization options.
  // Old ptx versions have a bug. When PTX code takes address of
  // byval parameter with alignment < 4, ptxas generates code to
  // spill argument into memory. Alas on sm_50+ ptxas generates
  // SASS code that fails with misaligned access. To work around
  // the problem, make sure that we align byval parameters by at
  // least 4. This bug seems to be fixed at least starting from
  // TODO: remove this after verifying the bug is not reproduced
  // on non-deprecated ptxas versions.
  ArgAlign = std::max(ArgAlign, Align(4));
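  // For example (illustrative): a byval struct parameter annotated with
  // "align 2" would be bumped to align 4 here, so affected ptxas versions do
  // not emit the misaligned spill sequence described above.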
// Helper for getting a function parameter name. Name is composed from
// its index and the function name. Negative index corresponds to special
// parameter (unsized array) used for passing variable arguments.
  std::string ParamName;
    ParamStr << "_vararg";
    ParamStr << "_param_" << Idx;
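  // For example (illustrative, hypothetical function name): for a device
  // function "foo", parameter index 2 yields the name "foo_param_2", while
  // the negative index used for the variadic marker yields "foo_vararg".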
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
/// Used to guide target specific optimizations, like loop strength reduction
/// (LoopStrengthReduce.cpp) and memory optimization for address mode
/// (CodeGenPrepare.cpp)
  // AddrMode - This represents an addressing mode of:
  //    BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
  //
  // The legal address modes are
  //
  // immoff must fit in a signed 32-bit int
  case 0: // "r", "r+i" or "i" is allowed
    if (AM.HasBaseReg)
      // "r+r+i" or "r+r" is not allowed.
    // Otherwise we have r+i.
  // No scale > 1 is allowed

//===----------------------------------------------------------------------===//
// NVPTX Inline Assembly Support
//===----------------------------------------------------------------------===//

/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
std::pair<unsigned, const TargetRegisterClass *>
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
      return std::make_pair(0U, &NVPTX::Int1RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int16RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int32RegsRegClass);
      return std::make_pair(0U, &NVPTX::Int64RegsRegClass);
                           "supported for sm_70 and higher!");
      return std::make_pair(0U, &NVPTX::Int128RegsRegClass);
      return std::make_pair(0U, &NVPTX::Float32RegsRegClass);
      return std::make_pair(0U, &NVPTX::Float64RegsRegClass);

//===----------------------------------------------------------------------===//
// NVPTX DAG Combining
//===----------------------------------------------------------------------===//

  // Always honor command-line argument
  // Do not contract if we're not optimizing the code.
  // Honor TargetOptions flags that explicitly say fusion is okay.
  // Honor TargetOptions flags that explicitly say unsafe math is okay.
  // Allow unsafe math if unsafe-fp-math attribute explicitly says so.
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 0;

/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
/// operands N0 and N1. This is a helper for PerformADDCombine that is
/// called with the default operands, and if that fails, with commuted
/// operands.
  // Since integer multiply-add costs the same as integer multiply
  // but is more costly than integer add, do the fusion only when
  // the mul is only used in the add.
  // TODO: this may not be true for later architectures, consider relaxing this
  // fold (add (select cond, 0, (mul a, b)), c)
  //   -> (select cond, c, (add (mul a, b), c))
  if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
                     ((ZeroOpNum == 1) ? N1 : MAD),
                     ((ZeroOpNum == 1) ? MAD : N1));
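  // For example (illustrative): when the multiply feeds only this add,
  //   (add (mul a, b), c)
  // can be selected as a single mad.lo instruction; if the mul has other
  // users it has to be materialized anyway, so the contraction would not
  // save an instruction.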
  // For floating point:
  // Do the fusion only when the mul has less than 5 uses and all
  // are add.
  // The heuristic is that if a use is not an add, then that use
  // cannot be fused into fma, therefore mul is still needed anyway.
  // If there are more than 4 uses, even if they are all add, fusing
  // them will increase register pressure.
  int orderNo = N->getIROrder();
  // Simple heuristics here for considering potential register
  // pressure: the logic is that the difference is used
  // to measure the distance between def and use; the longer the distance,
  // the more likely it is to cause register pressure.
  if (orderNo - orderNo2 < 500)
  // Now, check if at least one of the FMUL's operands is live beyond the
  // node N, which guarantees that the FMA will not increase register
  // pressure at node N.
  bool opIsLive = false;
  if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
  int orderNo3 = User->getIROrder();
  if (orderNo3 > orderNo) {
  int orderNo3 = User->getIROrder();
  if (orderNo3 > orderNo) {
  if (all_of(N->ops().drop_front(Front).drop_back(Back),
             [](const SDUse &U) { return U.get()->isUndef(); }))
    // Operand 0 is the previous value in the chain. Cannot return EntryToken
    // as the previous value will become unused and eliminated later.
    return N->getOperand(0);

  // Operands from the 3rd to the 2nd last one are the values to be stored.
  //   {Chain, ArgID, Offset, Val, Glue}
  // Operands from the 2nd to the last one are the values to be stored

/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
  // Skip non-integer, non-scalar case
  if (VT.isVector() || VT != MVT::i32)
  // First try with the default operand order.
  // If that didn't work, try again with the operands commuted.

/// PerformFADDCombine - Target-specific dag combine xforms for ISD::FADD.
  if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
  // First try with the default operand order.
  // If that didn't work, try again with the operands commuted.

  // The type legalizer turns a vector load of i8 values into a zextload to i16
  // registers, optionally ANY_EXTENDs it (if target type is integer),
  // and ANDs off the high 8 bits. Since we turn this load into a
  // target-specific DAG node, the DAG combiner fails to eliminate these AND
  // nodes. Do that here.
  if (isa<ConstantSDNode>(Val)) {
    // Convert BFE -> truncate i16 -> and 255
    // to just BFE -> truncate i16, as the value already has all the bits in
    // the right place.
    ConstantSDNode *BFEBits = dyn_cast<ConstantSDNode>(BFE.getOperand(0));
      // Not an AND with a constant
    if (MaskVal != (uint64_t(1) << BFEBitsVal) - 1)
    // If we get here, the AND is unnecessary. Just replace it with the trunc
    // Generally, we will see zextload -> IMOV16rr -> ANY_EXTEND -> and
      // Not an AND with a constant
    if (MaskVal != 0xff) {
      // Not an AND that chops off top 8 bits
    MemSDNode *Mem = dyn_cast<MemSDNode>(Val);
      // Not a MemSDNode?!?
    if (MemVT != MVT::v2i8 && MemVT != MVT::v4i8) {
      // We only handle the i8 case
    // If for some reason the load is a sextload, the and is needed to zero
    // out the high 8 bits
    if (AExt.getNode() != nullptr) {
      // Re-insert the ext as a zext.
    // If we get here, the AND is unnecessary. Just replace it with the load

  // Don't do anything at less than -O2.
  EVT VT = N->getValueType(0);
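  // For example (illustrative): a zero-extending i8 load that is later
  // ANY_EXTENDed and masked with 255, i.e.
  //   (and (any_extend (zextload i8)), 255)
  // already has zeros in the upper bits, so the AND can be replaced by the
  // load result itself once the load has become a target-specific node.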
  const SDValue &Num = N->getOperand(0);
  const SDValue &Den = N->getOperand(1);
    if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
        U->getOperand(1) == Den) {
      // Num % Den -> Num - (Num / Den) * Den

/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
/// that can be demoted to \p OptSize bits without loss of information. The
/// signedness of the operand, if determinable, is placed in \p S.
    EVT OrigVT = Op.getOperand(0).getValueType();
    EVT OrigVT = Op.getOperand(0).getValueType();
/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
/// be demoted to \p OptSize bits without loss of information. If the operands
/// contain a constant, it should appear as the RHS operand. The signedness of
/// the operands is placed in \p IsSigned.
  // The LHS operand must be a demotable op
  // We should have been able to determine the signedness from the LHS
  IsSigned = (LHSSign == Signed);

  // The RHS can be a demotable op or a constant
    const APInt &Val = CI->getAPIntValue();
    return Val.isIntN(OptSize);
  return LHSSign == RHSSign;
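  // For example (illustrative): for a 32-bit multiply, an LHS that is
  // (sext i16 %x) and a constant RHS such as 1000 (which fits in 16 bits,
  // per the isIntN check above) are both demotable, so the pair qualifies
  // for the mul.wide transform below.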
/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
/// amount.
  EVT MulType = N->getValueType(0);
  if (MulType != MVT::i32 && MulType != MVT::i64) {

  // Canonicalize the multiply so the constant (if any) is on the right
  if (isa<ConstantSDNode>(LHS)) {

  // If we have a SHL, determine the actual multiply amount

  // Verify that our operands are demotable
  if (MulType == MVT::i32) {
    DemotedVT = MVT::i16;
    DemotedVT = MVT::i32;

  // Truncate the operands to the correct size. Note that these are just for
  // type consistency and will (likely) be eliminated in later phases.
  return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
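  // For example (illustrative, hypothetical register names):
  //   (mul i32 (sext i16 %a), (sext i16 %b))
  // has both operands demotable to 16 bits, so it can be emitted as
  //   mul.wide.s16 %r, %ra, %rb;
  // producing the full 32-bit product directly from the 16-bit sources.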
  const auto *Const = dyn_cast<ConstantSDNode>(Operand);
  return Const && Const->getZExtValue() == 1;

  return Add->getOperand(1);
  return Add->getOperand(0);

  // Do not combine if the resulting sequence is not obviously profitable.
                        (ConstOpNo == 1) ? X : NewMul,
                        (ConstOpNo == 1) ? NewMul : X);

  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)

  // (mul x, (add y, 1)) -> (add (mul x, y), x)
  // (mul x, (select y, 1)) -> (select (mul x, y), x)

/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.

/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
  // Try mul.wide combining at OptLevel > 0

  EVT CCType = N->getValueType(0);
  EVT AType = A.getValueType();
  if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))

  if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)

  // setp.f16x2 returns two scalar predicates, which we need to
  // convert back to v2i1. The returned result will be scalarized by
  // the legalizer, but the comparison will remain a single vector
                DL, DCI.DAG.getVTList(MVT::i1, MVT::i1),
                {A, B, N->getOperand(2)});
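  // For example (illustrative): a SETCC of two v2f16 values with a v2i1
  // result is rebuilt here as a single target comparison node returning two
  // scalar i1 predicates (matching what setp.f16x2 produces), which are then
  // packed back together to recreate the v2i1 value.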
    return SDValue(); // Native vector loads already combine nicely w/
                      // extract_vector_elt.
  // Don't mess with singletons or v2*16, v4i8 and v8i8 types, we already
  // handle them OK.
      VectorVT == MVT::v4i8 || VectorVT == MVT::v8i8)

  // Don't mess with undef values as sra may be simplified to 0, not undef.
  // We only handle the types we can extract in-register.
  if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))

  // Index == 0 is handled by generic DAG combiner.
  if (!Index || Index->getZExtValue() == 0)

  // If element has non-integer type, bitcast it back to the expected type.
  // Past legalizer, we may need to extend i8 -> i16 to match the register type.
  if (EltVT != N->getValueType(0))

  if (VectorVT != MVT::v4i8)

  // We need to split vselect into individual per-element operations, because
  // we use the BFE/BFI instructions for byte extraction/insertion and we end
  // up with 32-bit values, so we may as well do the comparison as i32 to
  // avoid conversions to/from i16 normally used for i8 values.
  for (int I = 0; I < 4; ++I) {
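  // For example (illustrative): a (vselect %c, %a, %b) on v4i8 is rebuilt as
  // four independent i32-wide selects, one per byte, since the bytes are
  // extracted and reinserted with BFE/BFI as 32-bit values anyway.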
  auto VT = N->getValueType(0);
  auto Op0 = N->getOperand(0);
  auto Op1 = N->getOperand(1);

  // Start out by assuming we want to take the lower 2 bytes of each i32
  std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
                                                {&Op1, &Op1Bytes}};
  // Check that each operand is an i16, truncated from an i32 operand. We'll
  // select individual bytes from those original operands. Optionally, fold in
  // a shift right of that original operand.
  for (auto &[Op, OpBytes] : OpData) {
    // Eat up any bitcast
      *Op = Op->getOperand(0);

          Op->getOperand(0).getValueType() == MVT::i32))

    // If the truncate has multiple uses, this optimization can increase
    // register pressure.
    if (!Op->hasOneUse())

    *Op = Op->getOperand(0);

    // Optionally, fold in a shift-right of the original operand and let
    // permute pick the two higher bytes of the original value directly.
    if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
      if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
        // Shift the PRMT byte selector to pick upper bytes from each
        // respective value, instead of the lower ones: 0x10 -> 0x32,
        // 0x54 -> 0x76
        assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
               "PRMT selector values out of range");

        *Op = Op->getOperand(0);

      {Op0, Op1,
       DAG.getConstant((Op1Bytes << 8) | Op0Bytes, DL, MVT::i32),
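  // For example (illustrative): with Op0Bytes = 0x10 and Op1Bytes = 0x54 the
  // combined selector is 0x5410, so the prmt.b32 result takes the two low
  // bytes of Op0 (selectors 0 and 1) and the two low bytes of Op1 (selectors
  // 4 and 5). After folding a srl by 16, the 0x10 selector becomes 0x32 and
  // picks Op0's upper two bytes instead.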
                                               DAGCombinerInfo &DCI) const {
  switch (N->getOpcode()) {

  // Handle bitcasting to v2i8 without hitting the default promotion
  // strategy which goes through stack memory.
  EVT ToVT = Op->getValueType(0);
  if (ToVT != MVT::v2i8) {
  // Bitcast to i16 and unpack elements into a vector

/// ReplaceLoadVector - Convert vector loads into multi-output scalar loads.
  EVT ResVT = N->getValueType(0);
  if (!NumEltsAndEltVT)
  auto [NumElts, EltVT] = NumEltsAndEltVT.value();

  Align Alignment = LD->getAlign();
  if (Alignment < PrefAlign) {
    // This load is not sufficiently aligned, so bail out and let this vector
    // load be scalarized. Note that we may still be able to emit smaller
    // vector loads. For example, if we are loading a <4 x float> with an
    // alignment of 8, this check will fail but the legalizer will try again
    // with 2 x <2 x float>, which will succeed with an alignment of 8.

  // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
  // Therefore, we must ensure the type is legal. For i1 and i8, we set the
  // loaded type to i16 and propagate the "real" type as the memory type.
  bool NeedTrunc = false;
    LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
    EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};

  // Copy regular operands
  // The select routine does not have access to the LoadSDNode instance, so
  // pass along the extension information
                              LD->getMemOperand());
         "NumElts should not increase, only decrease or stay the same.");
  // If the number of elements has decreased, getVectorLoweringShape has
  // upsized the element types
  // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
  // into individual elements.
  for (unsigned i = 0; i < NumElts; ++i) {
  for (unsigned i = 0; i < NumElts; ++i) {
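  // For example (illustrative): a sufficiently aligned load of <4 x float>
  // is rewritten as a single target load node with four f32 results plus a
  // chain (later selected to a vectorized ld.v4.f32), and the scalar results
  // are then reassembled into the original <4 x float> value.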
  // Get the intrinsic ID
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_p: {
    EVT ResVT = N->getValueType(0);

    // Since LDU/LDG are target nodes, we cannot rely on DAG type
    // legalization.
    // Therefore, we must ensure the type is legal. For i1 and i8, we set the
    // loaded type to i16 and propagate the "real" type as the memory type.
    bool NeedTrunc = false;
      LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
      EVT ListVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};

    // Copy regular operands
    // Skip operand 1 (intrinsic ID)
    OtherOps.append(N->op_begin() + 2, N->op_end());

    for (unsigned i = 0; i < NumElts; ++i) {
           "Custom handling of non-i8 ldu/ldg?");

    // Just copy all operands as-is
    // Force output to i16
    // We make sure the memory type is i8, which will be used during isel
    // to select the proper instruction.

  // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
  // result so that it can pass the legalization
  assert(Reg.getValueType() == MVT::i128 &&
         "Custom lowering for CopyFromReg with 128-bit reg only");
void NVPTXTargetLowering::ReplaceNodeResults(
  switch (N->getOpcode()) {

  auto ITy = cast<llvm::IntegerType>(Ty);
  switch (ITy->getBitWidth()) {
  switch (ITy->getBitWidth()) {
// Pin NVPTXTargetObjectFile's vtables to this file.
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This file contains the declarations of entities that describe floating point environment and related ...
Module.h This file contains the declarations for the Module class.
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first found DebugLoc that has a DILocation, given a range of instructions.
unsigned const TargetRegisterInfo * TRI
NVPTX address space definition.
static bool shouldConvertToIndirectCall(const CallBase *CB, const GlobalAddressSDNode *Func)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static cl::opt< int > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specifies: 0 use div.approx, 1 use div.full, 2 use" " IEEE Compliant F32 div.rnd if available."), cl::init(2))
static SDValue PerformStoreParamCombine(SDNode *N)
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static bool Is16bitsType(MVT VT)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static bool IsTypePassedAsArray(const Type *Ty)
static SmallVector< ParamVectorizationFlags, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static unsigned CanMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< uint64_t > &Offsets, Align ParamAlignment)
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue PerformFADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static bool isConstZero(const SDValue &Operand)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > *Offsets=nullptr, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive EVTs that compose it.
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static SDValue LowerUnalignedStoreParam(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue StVal, SDValue &InGlue, unsigned ArgID, const SDLoc &dl)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static std::optional< std::pair< unsigned int, EVT > > getVectorLoweringShape(EVT VectorVT)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformStoreRetvalCombine(SDNode *N)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static SDValue PerformStoreCombineHelper(SDNode *N, std::size_t Front, std::size_t Back)
static bool adjustElementType(EVT &ElementType)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue MaybeBitcast(SelectionDAG &DAG, SDLoc DL, EVT VT, SDValue Value)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static SDValue LowerUnalignedStoreRet(SelectionDAG &DAG, SDValue Chain, uint64_t Offset, EVT ElementType, SDValue RetVal, const SDLoc &dl)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static bool PromoteScalarIntegerPTX(const EVT &VT, MVT *PromotedVT)
PromoteScalarIntegerPTX Used to make sure the arguments/returns are suitable for passing and promote ...
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static SDValue LowerUnalignedLoadRetParam(SelectionDAG &DAG, SDValue &Chain, uint64_t Offset, EVT ElementType, SDValue &InGlue, SmallVectorImpl< SDValue > &TempProxyRegOps, const SDLoc &dl)
static std::atomic< unsigned > GlobalUniqueCallSite
static cl::opt< bool > ForceMinByValParamAlign("nvptx-force-min-byval-param-align", cl::Hidden, cl::desc("NVPTX Specific: force 4-byte minimal alignment for byval" " params of device functions."), cl::init(false))
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
This file describes how to lower LLVM code to machine code.
Class for arbitrary precision integers.
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
bool slt(const APInt &RHS) const
Signed less than comparison.
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
const T & back() const
back - Get the last element.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
an instruction that atomically reads a memory location, combines it with another value,...
@ Min
*p = old <signed v ? old : v
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
bool isFloatingPointOperation() const
BinOp getOperation() const
bool hasParamAttr(unsigned ArgNo, Attribute::AttrKind Kind) const
Return true if the attribute exists for the given argument.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
This class represents a function call, abstracting a target machine's calling convention.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Type * getReturnType() const
Returns the type of the ret val.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
Instances of this class represent a uniqued identifier for a section in the current translation unit.
StringRef getName() const
getName - Get the symbol name.
unsigned getVectorNumElements() const
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
static auto fixedlen_vector_valuetypes()
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineJumpTableInfo * getJumpTableInfo() const
getJumpTableInfo - Return the jump table info object for the current function.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
const std::vector< MachineJumpTableEntry > & getJumpTables() const
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasAtomAddF64() const
const NVPTXTargetLowering * getTargetLowering() const override
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasNativeBF16Support(int Opcode) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getSmVersion() const
bool hasAtomBitwise64() const
bool allowFP16Math() const
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
Align getFunctionArgumentAlignment(const Function *F, Type *Ty, unsigned Idx, const DataLayout &DL) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &, const SmallVectorImpl< ISD::OutputArg > &, MaybeAlign retAlignment, std::optional< std::pair< unsigned, const APInt & > > VAInfo, const CallBase &CB, unsigned UniqueCallSite) const
Align getFunctionParamOptimizedAlign(const Function *F, Type *ArgTy, const DataLayout &DL) const
getFunctionParamOptimizedAlign - since function arguments are passed via .param space,...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
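A minimal sketch of such a hook, assuming a hypothetical MyTargetLowering that asks the AtomicExpand pass to turn oversized atomicrmw instructions into cmpxchg loops (the 64-bit cutoff is purely illustrative):

  TargetLowering::AtomicExpansionKind
  MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
    // Expand anything wider than the widest natively supported atomic width.
    unsigned Bits = AI->getType()->getPrimitiveSizeInBits().getFixedValue();
    return Bits > 64 ? AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
  }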
Align getFunctionByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL) const
Helper for computing alignment of a device function byval parameter.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
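Sketch of how such an override typically fills in the IntrinsicInfo record; the intrinsic ID and types below are placeholders, only the shape of the code follows the real hook:

  bool MyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
                                            MachineFunction &MF,
                                            unsigned Intrinsic) const {
    switch (Intrinsic) {
    case Intrinsic::my_ld_v4f32:                // hypothetical memory intrinsic
      Info.opc = ISD::INTRINSIC_W_CHAIN;
      Info.memVT = MVT::v4f32;                  // in-memory value type
      Info.ptrVal = I.getArgOperand(0);         // pointer operand of the call
      Info.offset = 0;
      Info.align = Align(16);
      Info.flags = MachineMemOperand::MOLoad;   // reads memory only
      return true;                              // becomes a MemIntrinsicNode
    default:
      return false;
    }
  }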
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32() const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool allowUnsafeFPMath(MachineFunction &MF) const
int getDivF32Level() const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
UniqueStringSaver & getStrPool() const
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
~NVPTXTargetObjectFile() override
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumOperands() const
Return the number of values used by this operation.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
bool isUndef() const
Return true if the node represents an undefined value.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
Get the SDNode which holds the desired result.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
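For example, a signed-minimum pattern can be built from these helpers roughly like this (DL, A and B are assumed to be an SDLoc and two i32 SDValues already in scope; a real target would take the boolean type from getSetCCResultType rather than hard-coding i1):

  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETLT);  // A < B ?
  SDValue Min = DAG.getSelect(DL, MVT::i32, Cmp, A, B);       // pick the smaller value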
SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
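A small sketch of the simplest overload, assuming Chain, Ptr and DL are existing values inside a lowering routine and using a deliberately generic MachinePointerInfo:

  SDValue Load = DAG.getLoad(MVT::f32, DL, Chain, Ptr,
                             MachinePointerInfo(), Align(4));
  SDValue OutChain = Load.getValue(1); // result 1 of a load is the output chain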
const TargetLowering & getTargetLoweringInfo() const
SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
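For instance, incrementing an existing i32 value N by one combines getConstant with the generic getNode builder listed further below (N and DL are assumed to be in scope):

  SDValue One = DAG.getConstant(1, DL, MVT::i32);
  SDValue Inc = DAG.getNode(ISD::ADD, DL, MVT::i32, N, One);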
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
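Sketch of a plain aligned store, mirroring the load example above (Chain, Val, Ptr and DL assumed in scope):

  SDValue Store = DAG.getStore(Chain, DL, Val, Ptr,
                               MachinePointerInfo(), Align(4));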
SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getBasicBlock(MachineBasicBlock *MBB)
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
MachineFunction & getMachineFunction() const
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
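Rough shape of a call to this builder for a target intrinsic that loads a v4f32; MyTargetISD::LoadV4 is a placeholder target opcode, and Chain, Ptr and DL are assumed to be in scope:

  SDVTList VTs = DAG.getVTList(MVT::v4f32, MVT::Other);   // value + chain results
  SDValue Ops[] = {Chain, Ptr};
  SDValue Ld = DAG.getMemIntrinsicNode(MyTargetISD::LoadV4, DL, VTs, Ops,
                                       MVT::v4f32, MachinePointerInfo(),
                                       Align(16), MachineMemOperand::MOLoad);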
SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
StringRef - Represent a constant reference to a string, i.e.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Class to represent struct types.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
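Illustrative constructor fragment (not the NVPTX configuration) showing the usual pattern of declaring per-type operation actions, including promotion via setOperationPromotedToType listed below:

  setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);     // legalizer rewrites it
  setOperationAction(ISD::BR_CC, MVT::f64, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); // routed to LowerOperation
  setOperationPromotedToType(ISD::SETCC, MVT::f16, MVT::f32);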
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
unsigned MaxStoresPerMemcpyOptSize
Like MaxStoresPerMemcpy, but for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Like MaxStoresPerMemset, but for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Like MaxStoresPerMemmove, but for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrNegativeOneBooleanContent
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
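Companion sketch for the load-extension and truncating-store hooks (again a generic constructor fragment, not the NVPTX configuration):

  for (MVT VT : {MVT::f32, MVT::f64})
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);  // no native f16 extending load
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);         // no native f16 truncating store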
std::vector< ArgListEntry > ArgListTy
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
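A common use of these two helpers, sketched for a custom STORE lowering: check the access with allowsMemoryAccessForAlignment (listed above) and fall back to splitting when it is not supported (ST is an assumed StoreSDNode*):

  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                      ST->getMemoryVT(), ST->getAddressSpace(),
                                      ST->getAlign(),
                                      ST->getMemOperand()->getFlags()))
    return expandUnalignedStore(ST, DAG);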
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
MCSymbol * getSymbol(const GlobalValue *GV) const
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
@ VoidTyID
type with no size
bool isAggregateType() const
Return true if the type is an aggregate type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
StringRef save(const char *S)
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
int getNumOccurrences() const
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
@ C
The default llvm calling convention, compatible with C.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ BSWAP
Byte Swap and Counting operators.
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ SIGN_EXTEND
Conversion operators.
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the llvm.readsteadycounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
RESULT, BOOL = [SU]SUBO(LHS, RHS) - Overflow-aware nodes for subtraction.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ UNDEF
UNDEF - An undefined node.
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
@ SMULO
RESULT, BOOL = [SU]MULO(LHS, RHS) - Overflow-aware nodes for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
@ FREEZE
FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or is evaluated to UNDEF),...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
@ Bitcast
Perform the operation on a different, but equivalently sized type.
initializer< Ty > init(const Ty &Val)
Free functions and utilities from the llvm namespace referenced by this file.
static bool isIndirectCall(const MachineInstr &MI)
bool shouldEmitPTXNoReturn(const Value *V, const TargetMachine &TM)
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
MaybeAlign getAlign(const Function &F, unsigned Index)
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
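A tiny self-contained sketch of the range helpers listed here (all_of, enumerate and transform):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"

  void rangeDemo() {
    llvm::SmallVector<int, 4> In = {1, 2, 3, 4};
    llvm::SmallVector<int, 4> Out(In.size());

    bool AllPositive = llvm::all_of(In, [](int V) { return V > 0; });  // true

    for (auto E : llvm::enumerate(In))      // (index, value) pairs
      Out[E.index()] = E.value() * 2;

    llvm::transform(In, Out.begin(), [](int V) { return V + 1; });     // overwrite with V+1
    (void)AllPositive;
  }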
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
unsigned promoteScalarArgumentSize(unsigned size)
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
CodeGenOptLevel
Code generation optimization level.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
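Typical call shape, assuming TLI, DL and an IR type Ty are available; ValueVTs gets one entry per scalar piece and Offsets its byte offset within the type:

  SmallVector<EVT, 8> ValueVTs;
  SmallVector<TypeSize, 8> Offsets;
  ComputeValueVTs(TLI, DL, Ty, ValueVTs, &Offsets);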
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
Function * getMaybeBitcastedCallee(const CallBase *CB)
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
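Self-contained sketch of the arithmetic and alignment helpers listed nearby (alignTo, PowerOf2Ceil, isPowerOf2_32 and commonAlignment):

  #include "llvm/Support/Alignment.h"
  #include "llvm/Support/MathExtras.h"
  #include <cstdint>

  void alignDemo() {
    uint64_t Padded = llvm::alignTo(10, llvm::Align(8));              // 16
    uint64_t Ceil = llvm::PowerOf2Ceil(10);                           // 16
    bool Pow2 = llvm::isPowerOf2_32(16);                              // true
    llvm::Align Common = llvm::commonAlignment(llvm::Align(16), 8);   // Align(8)
    (void)Padded; (void)Ceil; (void)Pow2; (void)Common;
  }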
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static const fltSemantics & IEEEsingle() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
ElementCount getVectorElementCount() const
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
uint64_t getScalarSizeInBits() const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
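Quick sketch of the EVT queries listed above, assuming an LLVMContext Ctx is in scope:

  EVT VecVT = EVT::getVectorVT(Ctx, MVT::f32, 4);   // v4f32
  unsigned NumElts = VecVT.getVectorNumElements();  // 4
  EVT EltVT = VecVT.getVectorElementType();         // f32
  uint64_t Bits = VecVT.getFixedSizeInBits();       // 128
  EVT IntVT = VecVT.changeTypeToInteger();          // v4i32
  (void)NumElts; (void)EltVT; (void)Bits; (void)IntVT;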
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isAfterLegalizeDAG() const
SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)