Movatterモバイル変換

Go to the documentation of this file.

1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//

2//

3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

4// See https://llvm.org/LICENSE.txt for license information.

5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

6//

7//===----------------------------------------------------------------------===//

8//

9// This pass tries to fuse DS instructions with close by immediate offsets.

10// This will fuse operations such as

11// ds_read_b32 v0, v2 offset:16

12// ds_read_b32 v1, v2 offset:32

13// ==>

14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8

15//

16// The same is done for certain SMEM and VMEM opcodes, e.g.:

17// s_buffer_load_dword s4, s[0:3], 4

18// s_buffer_load_dword s5, s[0:3], 8

19// ==>

20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4

21//

22// This pass also tries to promote constant offset to the immediate by

23// adjusting the base. It tries to use a base from the nearby instructions that

24// allows it to have a 13bit constant offset and then promotes the 13bit offset

25// to the immediate.

26// E.g.

27// s_movk_i32 s0, 0x1800

28// v_add_co_u32_e32 v0, vcc, s0, v2

29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc

30//

31// s_movk_i32 s0, 0x1000

32// v_add_co_u32_e32 v5, vcc, s0, v2

33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

34// global_load_dwordx2 v[5:6], v[5:6], off

35// global_load_dwordx2 v[0:1], v[0:1], off

36// =>

37// s_movk_i32 s0, 0x1000

38// v_add_co_u32_e32 v5, vcc, s0, v2

39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc

40// global_load_dwordx2 v[5:6], v[5:6], off

41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048

42//

43// Future improvements:

44//

45// - This is currently missing stores of constants because loading

46// the constant into the data register is placed between the stores, although

47// this is arguably a scheduling problem.

48//

49// - Live interval recomputing seems inefficient. This currently only matches

50// one pair, and recomputes live intervals and moves on to the next pair. It

51// would be better to compute a list of all merges that need to occur.

52//

53// - With a list of instructions to process, we can also merge more. If a

54// cluster of loads have offsets that are too large to fit in the 8-bit

55// offsets, but are close enough to fit in the 8 bits, we can add to the base

56// pointer and use the new reduced offsets.

57//

58//===----------------------------------------------------------------------===//

60#include "SILoadStoreOptimizer.h"

61#include "AMDGPU.h"

62#include "GCNSubtarget.h"

63#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

64#include "llvm/Analysis/AliasAnalysis.h"

65#include "llvm/CodeGen/MachineFunctionPass.h"

66#include "llvm/InitializePasses.h"

68using namespacellvm;

70#define DEBUG_TYPE "si-load-store-opt"

72namespace{

73enum InstClassEnum {

74 UNKNOWN,

75 DS_READ,

76 DS_WRITE,

77 S_BUFFER_LOAD_IMM,

78 S_BUFFER_LOAD_SGPR_IMM,

79 S_LOAD_IMM,

80 BUFFER_LOAD,

81 BUFFER_STORE,

82 MIMG,

83 TBUFFER_LOAD,

84 TBUFFER_STORE,

85 GLOBAL_LOAD_SADDR,

86 GLOBAL_STORE_SADDR,

87 FLAT_LOAD,

88 FLAT_STORE,

89 GLOBAL_LOAD,// GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of

90 GLOBAL_STORE// any CombineInfo, they are only ever returned by

91// getCommonInstClass.

92};

94structAddressRegs {

95unsignedchar NumVAddrs = 0;

96bool SBase =false;

97bool SRsrc =false;

98bool SOffset =false;

99bool SAddr =false;

100bool VAddr =false;

101boolAddr =false;

102bool SSamp =false;

103};

104

105// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.

106constunsigned MaxAddressRegs = 12 + 1 + 1;

107

108classSILoadStoreOptimizer {

109structCombineInfo {

110MachineBasicBlock::iterator I;

111unsigned EltSize;

112unsigned Offset;

113unsigned Width;

114unsigned Format;

115unsigned BaseOff;

116unsigned DMask;

117 InstClassEnum InstClass;

118unsigned CPol = 0;

119bool IsAGPR;

120bool UseST64;

121int AddrIdx[MaxAddressRegs];

122constMachineOperand *AddrReg[MaxAddressRegs];

123unsigned NumAddresses;

124unsigned Order;

125

126bool hasSameBaseAddress(const CombineInfo &CI) {

127if (NumAddresses != CI.NumAddresses)

128returnfalse;

129

130constMachineInstr &MI = *CI.I;

131for (unsigned i = 0; i < NumAddresses; i++) {

132constMachineOperand &AddrRegNext =MI.getOperand(AddrIdx[i]);

133

134if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {

135if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||

136 AddrReg[i]->getImm() != AddrRegNext.getImm()) {

137returnfalse;

138 }

139continue;

140 }

141

142// Check same base pointer. Be careful of subregisters, which can occur

143// with vectors of pointers.

144if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||

145 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {

146returnfalse;

147 }

148 }

149returntrue;

150 }

151

152bool hasMergeableAddress(constMachineRegisterInfo &MRI) {

153for (unsigned i = 0; i < NumAddresses; ++i) {

154constMachineOperand *AddrOp = AddrReg[i];

155// Immediates are always OK.

156if (AddrOp->isImm())

157continue;

158

159// Don't try to merge addresses that aren't either immediates or registers.

160// TODO: Should be possible to merge FrameIndexes and maybe some other

161// non-register

162if (!AddrOp->isReg())

163returnfalse;

164

165// TODO: We should be able to merge instructions with other physical reg

166// addresses too.

167if (AddrOp->getReg().isPhysical() &&

168 AddrOp->getReg() != AMDGPU::SGPR_NULL)

169returnfalse;

170

171// If an address has only one use then there will be no other

172// instructions with the same address, so we can't merge this one.

173if (MRI.hasOneNonDBGUse(AddrOp->getReg()))

174returnfalse;

175 }

176returntrue;

177 }

178

179void setMI(MachineBasicBlock::iterator MI,const SILoadStoreOptimizer &LSO);

180

181// Compare by pointer order.

182booloperator<(const CombineInfo&Other) const{

183return (InstClass == MIMG) ? DMask <Other.DMask : Offset <Other.Offset;

184 }

185 };

186

187structBaseRegisters {

188Register LoReg;

189Register HiReg;

190

191unsigned LoSubReg = 0;

192unsigned HiSubReg = 0;

193 };

194

195structMemAddress {

196 BaseRegistersBase;

197 int64_t Offset = 0;

198 };

199

200usingMemInfoMap =DenseMap<MachineInstr *, MemAddress>;

201

202private:

203constGCNSubtarget *STM =nullptr;

204constSIInstrInfo *TII =nullptr;

205constSIRegisterInfo *TRI =nullptr;

206MachineRegisterInfo *MRI =nullptr;

207AliasAnalysis *AA =nullptr;

208bool OptimizeAgain;

209

210bool canSwapInstructions(constDenseSet<Register> &ARegDefs,

211constDenseSet<Register> &ARegUses,

212constMachineInstr &A,constMachineInstr &B)const;

213staticbool dmasksCanBeCombined(const CombineInfo &CI,

214constSIInstrInfo &TII,

215const CombineInfo &Paired);

216staticbool offsetsCanBeCombined(CombineInfo &CI,constGCNSubtarget &STI,

217 CombineInfo &Paired,bool Modify =false);

218staticbool widthsFit(constGCNSubtarget &STI,const CombineInfo &CI,

219const CombineInfo &Paired);

220unsigned getNewOpcode(const CombineInfo &CI,const CombineInfo &Paired);

221static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,

222const CombineInfo &Paired);

223constTargetRegisterClass *

224 getTargetRegisterClass(const CombineInfo &CI,

225const CombineInfo &Paired)const;

226constTargetRegisterClass *getDataRegClass(constMachineInstr &MI)const;

227

228 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

229

230void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,

231MachineBasicBlock::iterator InsertBefore,intOpName,

232Register DestReg)const;

233Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

234MachineBasicBlock::iterator InsertBefore,

235intOpName)const;

236

237unsigned read2Opcode(unsigned EltSize)const;

238unsigned read2ST64Opcode(unsigned EltSize)const;

239MachineBasicBlock::iterator

240 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

241MachineBasicBlock::iterator InsertBefore);

242

243unsigned write2Opcode(unsigned EltSize)const;

244unsigned write2ST64Opcode(unsigned EltSize)const;

245MachineBasicBlock::iterator

246 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,

247MachineBasicBlock::iterator InsertBefore);

248MachineBasicBlock::iterator

249 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

250MachineBasicBlock::iterator InsertBefore);

251MachineBasicBlock::iterator

252 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,

253MachineBasicBlock::iterator InsertBefore);

254MachineBasicBlock::iterator

255 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,

256MachineBasicBlock::iterator InsertBefore);

257MachineBasicBlock::iterator

258 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,

259MachineBasicBlock::iterator InsertBefore);

260MachineBasicBlock::iterator

261 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,

262MachineBasicBlock::iterator InsertBefore);

263MachineBasicBlock::iterator

264 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,

265MachineBasicBlock::iterator InsertBefore);

266MachineBasicBlock::iterator

267 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,

268MachineBasicBlock::iterator InsertBefore);

269MachineBasicBlock::iterator

270 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,

271MachineBasicBlock::iterator InsertBefore);

272

273void updateBaseAndOffset(MachineInstr &I,Register NewBase,

274 int32_t NewOffset)const;

275Register computeBase(MachineInstr &MI,const MemAddress &Addr)const;

276MachineOperand createRegOrImm(int32_t Val,MachineInstr &MI)const;

277 std::optional<int32_t> extractConstOffset(constMachineOperand &Op)const;

278void processBaseWithConstOffset(constMachineOperand &Base, MemAddress &Addr)const;

279 /// Promotes constant offset to the immediate by adjusting the base. It

280 /// tries to use a base from the nearby instructions that allows it to have

281 /// a 13bit constant offset which gets promoted to the immediate.

282bool promoteConstantOffsetToImm(MachineInstr &CI,

283 MemInfoMap &Visited,

284SmallPtrSet<MachineInstr *, 4> &Promoted)const;

285voidaddInstToMergeableList(const CombineInfo &CI,

286 std::list<std::list<CombineInfo> > &MergeableInsts)const;

287

288 std::pair<MachineBasicBlock::iterator, bool>collectMergeableInsts(

289MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,

290 MemInfoMap &Visited,SmallPtrSet<MachineInstr *, 4> &AnchorList,

291 std::list<std::list<CombineInfo>> &MergeableInsts)const;

292

293staticMachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,

294const CombineInfo &Paired);

295

296static InstClassEnum getCommonInstClass(const CombineInfo &CI,

297const CombineInfo &Paired);

298

299bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,

300bool &OptimizeListAgain);

301booloptimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);

302

303public:

304 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}

305bool run(MachineFunction &MF);

306};

307

308classSILoadStoreOptimizerLegacy :publicMachineFunctionPass {

309public:

310staticcharID;

311

312 SILoadStoreOptimizerLegacy() :MachineFunctionPass(ID) {}

313

314boolrunOnMachineFunction(MachineFunction &MF)override;

315

316StringRef getPassName() const override{return"SI Load Store Optimizer"; }

317

318voidgetAnalysisUsage(AnalysisUsage &AU) const override{

319 AU.setPreservesCFG();

320 AU.addRequired<AAResultsWrapperPass>();

321

322MachineFunctionPass::getAnalysisUsage(AU);

323 }

324

325MachineFunctionProperties getRequiredProperties() const override{

326returnMachineFunctionProperties()

327 .set(MachineFunctionProperties::Property::IsSSA);

328 }

329};

330

331staticunsigned getOpcodeWidth(constMachineInstr &MI,constSIInstrInfo &TII) {

332constunsigned Opc =MI.getOpcode();

333

334if (TII.isMUBUF(Opc)) {

335// FIXME: Handle d16 correctly

336returnAMDGPU::getMUBUFElements(Opc);

337 }

338if (TII.isImage(MI)) {

339uint64_t DMaskImm =

340TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

341returnllvm::popcount(DMaskImm);

342 }

343if (TII.isMTBUF(Opc)) {

344returnAMDGPU::getMTBUFElements(Opc);

345 }

346

347switch (Opc) {

348case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

349case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

350case AMDGPU::S_LOAD_DWORD_IMM:

351case AMDGPU::GLOBAL_LOAD_DWORD:

352case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

353case AMDGPU::GLOBAL_STORE_DWORD:

354case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

355case AMDGPU::FLAT_LOAD_DWORD:

356case AMDGPU::FLAT_STORE_DWORD:

357return 1;

358case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

359case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

360case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

361case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

362case AMDGPU::S_LOAD_DWORDX2_IMM:

363case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

364case AMDGPU::GLOBAL_LOAD_DWORDX2:

365case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

366case AMDGPU::GLOBAL_STORE_DWORDX2:

367case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

368case AMDGPU::FLAT_LOAD_DWORDX2:

369case AMDGPU::FLAT_STORE_DWORDX2:

370return 2;

371case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

372case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

373case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

374case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

375case AMDGPU::S_LOAD_DWORDX3_IMM:

376case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

377case AMDGPU::GLOBAL_LOAD_DWORDX3:

378case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

379case AMDGPU::GLOBAL_STORE_DWORDX3:

380case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

381case AMDGPU::FLAT_LOAD_DWORDX3:

382case AMDGPU::FLAT_STORE_DWORDX3:

383return 3;

384case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

385case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

386case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

387case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

388case AMDGPU::S_LOAD_DWORDX4_IMM:

389case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

390case AMDGPU::GLOBAL_LOAD_DWORDX4:

391case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

392case AMDGPU::GLOBAL_STORE_DWORDX4:

393case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

394case AMDGPU::FLAT_LOAD_DWORDX4:

395case AMDGPU::FLAT_STORE_DWORDX4:

396return 4;

397case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

398case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

399case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

400case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

401case AMDGPU::S_LOAD_DWORDX8_IMM:

402case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

403return 8;

404case AMDGPU::DS_READ_B32:

405case AMDGPU::DS_READ_B32_gfx9:

406case AMDGPU::DS_WRITE_B32:

407case AMDGPU::DS_WRITE_B32_gfx9:

408return 1;

409case AMDGPU::DS_READ_B64:

410case AMDGPU::DS_READ_B64_gfx9:

411case AMDGPU::DS_WRITE_B64:

412case AMDGPU::DS_WRITE_B64_gfx9:

413return 2;

414default:

415return 0;

416 }

417}

418

419/// Maps instruction opcode to enum InstClassEnum.

420static InstClassEnum getInstClass(unsigned Opc,constSIInstrInfo &TII) {

421switch (Opc) {

422default:

423if (TII.isMUBUF(Opc)) {

424switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {

425default:

426return UNKNOWN;

427case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:

428case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:

429case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:

430case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:

431case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:

432case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:

433case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:

434case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:

435case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:

436case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:

437case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:

438case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:

439case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:

440case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:

441case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:

442case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:

443return BUFFER_LOAD;

444case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:

445case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:

446case AMDGPU::BUFFER_STORE_DWORD_IDXEN:

447case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:

448case AMDGPU::BUFFER_STORE_DWORD_OFFEN:

449case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:

450case AMDGPU::BUFFER_STORE_DWORD_OFFSET:

451case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:

452case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:

453case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:

454case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:

455case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:

456case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:

457case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:

458case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:

459case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:

460return BUFFER_STORE;

461 }

462 }

463if (TII.isImage(Opc)) {

464// Ignore instructions encoded without vaddr.

465if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&

466 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))

467return UNKNOWN;

468// Ignore BVH instructions

469if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)

470return UNKNOWN;

471// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.

472if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

473TII.isGather4(Opc))

474return UNKNOWN;

475return MIMG;

476 }

477if (TII.isMTBUF(Opc)) {

478switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {

479default:

480return UNKNOWN;

481case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:

482case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:

483case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:

484case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:

485case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:

486case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:

487case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:

488case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:

489case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:

490case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:

491case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:

492case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:

493case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:

494case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:

495case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:

496case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:

497return TBUFFER_LOAD;

498case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:

499case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:

500case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:

501case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:

502case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:

503case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:

504case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:

505case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:

506return TBUFFER_STORE;

507 }

508 }

509return UNKNOWN;

510case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

511case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

512case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

513case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

514case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

515case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

516case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

517case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

518case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

519return S_BUFFER_LOAD_IMM;

520case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

521case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

522case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

523case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

524case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

525case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

526case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

527case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

528case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

529return S_BUFFER_LOAD_SGPR_IMM;

530case AMDGPU::S_LOAD_DWORD_IMM:

531case AMDGPU::S_LOAD_DWORDX2_IMM:

532case AMDGPU::S_LOAD_DWORDX3_IMM:

533case AMDGPU::S_LOAD_DWORDX4_IMM:

534case AMDGPU::S_LOAD_DWORDX8_IMM:

535case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

536case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

537case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

538case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

539return S_LOAD_IMM;

540case AMDGPU::DS_READ_B32:

541case AMDGPU::DS_READ_B32_gfx9:

542case AMDGPU::DS_READ_B64:

543case AMDGPU::DS_READ_B64_gfx9:

544return DS_READ;

545case AMDGPU::DS_WRITE_B32:

546case AMDGPU::DS_WRITE_B32_gfx9:

547case AMDGPU::DS_WRITE_B64:

548case AMDGPU::DS_WRITE_B64_gfx9:

549return DS_WRITE;

550case AMDGPU::GLOBAL_LOAD_DWORD:

551case AMDGPU::GLOBAL_LOAD_DWORDX2:

552case AMDGPU::GLOBAL_LOAD_DWORDX3:

553case AMDGPU::GLOBAL_LOAD_DWORDX4:

554case AMDGPU::FLAT_LOAD_DWORD:

555case AMDGPU::FLAT_LOAD_DWORDX2:

556case AMDGPU::FLAT_LOAD_DWORDX3:

557case AMDGPU::FLAT_LOAD_DWORDX4:

558return FLAT_LOAD;

559case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

560case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

561case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

562case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

563return GLOBAL_LOAD_SADDR;

564case AMDGPU::GLOBAL_STORE_DWORD:

565case AMDGPU::GLOBAL_STORE_DWORDX2:

566case AMDGPU::GLOBAL_STORE_DWORDX3:

567case AMDGPU::GLOBAL_STORE_DWORDX4:

568case AMDGPU::FLAT_STORE_DWORD:

569case AMDGPU::FLAT_STORE_DWORDX2:

570case AMDGPU::FLAT_STORE_DWORDX3:

571case AMDGPU::FLAT_STORE_DWORDX4:

572return FLAT_STORE;

573case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

574case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

575case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

576case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

577return GLOBAL_STORE_SADDR;

578 }

579}

580

581/// Determines instruction subclass from opcode. Only instructions

582/// of the same subclass can be merged together. The merged instruction may have

583/// a different subclass but must have the same class.

584staticunsigned getInstSubclass(unsigned Opc,constSIInstrInfo &TII) {

585switch (Opc) {

586default:

587if (TII.isMUBUF(Opc))

588returnAMDGPU::getMUBUFBaseOpcode(Opc);

589if (TII.isImage(Opc)) {

590constAMDGPU::MIMGInfo *Info =AMDGPU::getMIMGInfo(Opc);

591assert(Info);

592returnInfo->BaseOpcode;

593 }

594if (TII.isMTBUF(Opc))

595returnAMDGPU::getMTBUFBaseOpcode(Opc);

596return -1;

597case AMDGPU::DS_READ_B32:

598case AMDGPU::DS_READ_B32_gfx9:

599case AMDGPU::DS_READ_B64:

600case AMDGPU::DS_READ_B64_gfx9:

601case AMDGPU::DS_WRITE_B32:

602case AMDGPU::DS_WRITE_B32_gfx9:

603case AMDGPU::DS_WRITE_B64:

604case AMDGPU::DS_WRITE_B64_gfx9:

605return Opc;

606case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

607case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

608case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

609case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

610case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

611case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

612case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

613case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

614case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

615return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;

616case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

617case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

618case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

619case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

620case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

621case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

622case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

623case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

624case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

625return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;

626case AMDGPU::S_LOAD_DWORD_IMM:

627case AMDGPU::S_LOAD_DWORDX2_IMM:

628case AMDGPU::S_LOAD_DWORDX3_IMM:

629case AMDGPU::S_LOAD_DWORDX4_IMM:

630case AMDGPU::S_LOAD_DWORDX8_IMM:

631case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

632case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

633case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

634case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

635return AMDGPU::S_LOAD_DWORD_IMM;

636case AMDGPU::GLOBAL_LOAD_DWORD:

637case AMDGPU::GLOBAL_LOAD_DWORDX2:

638case AMDGPU::GLOBAL_LOAD_DWORDX3:

639case AMDGPU::GLOBAL_LOAD_DWORDX4:

640case AMDGPU::FLAT_LOAD_DWORD:

641case AMDGPU::FLAT_LOAD_DWORDX2:

642case AMDGPU::FLAT_LOAD_DWORDX3:

643case AMDGPU::FLAT_LOAD_DWORDX4:

644return AMDGPU::FLAT_LOAD_DWORD;

645case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

646case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

647case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

648case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

649return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;

650case AMDGPU::GLOBAL_STORE_DWORD:

651case AMDGPU::GLOBAL_STORE_DWORDX2:

652case AMDGPU::GLOBAL_STORE_DWORDX3:

653case AMDGPU::GLOBAL_STORE_DWORDX4:

654case AMDGPU::FLAT_STORE_DWORD:

655case AMDGPU::FLAT_STORE_DWORDX2:

656case AMDGPU::FLAT_STORE_DWORDX3:

657case AMDGPU::FLAT_STORE_DWORDX4:

658return AMDGPU::FLAT_STORE_DWORD;

659case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

660case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

661case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

662case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

663return AMDGPU::GLOBAL_STORE_DWORD_SADDR;

664 }

665}

666

667// GLOBAL loads and stores are classified as FLAT initially. If both combined

668// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.

669// If either or both instructions are non segment specific FLAT the resulting

670// combined operation will be FLAT, potentially promoting one of the GLOBAL

671// operations to FLAT.

672// For other instructions return the original unmodified class.

673InstClassEnum

674SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,

675const CombineInfo &Paired) {

676assert(CI.InstClass == Paired.InstClass);

677

678if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&

679SIInstrInfo::isFLATGlobal(*CI.I) &&SIInstrInfo::isFLATGlobal(*Paired.I))

680return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

681

682return CI.InstClass;

683}

684

685static AddressRegs getRegs(unsigned Opc,constSIInstrInfo &TII) {

686 AddressRegs Result;

687

688if (TII.isMUBUF(Opc)) {

689if (AMDGPU::getMUBUFHasVAddr(Opc))

690 Result.VAddr =true;

691if (AMDGPU::getMUBUFHasSrsrc(Opc))

692 Result.SRsrc =true;

693if (AMDGPU::getMUBUFHasSoffset(Opc))

694 Result.SOffset =true;

695

696return Result;

697 }

698

699if (TII.isImage(Opc)) {

700int VAddr0Idx =AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);

701if (VAddr0Idx >= 0) {

702int RsrcName =

703TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;

704int RsrcIdx =AMDGPU::getNamedOperandIdx(Opc, RsrcName);

705 Result.NumVAddrs = RsrcIdx - VAddr0Idx;

706 }else {

707 Result.VAddr =true;

708 }

709 Result.SRsrc =true;

710constAMDGPU::MIMGInfo *Info =AMDGPU::getMIMGInfo(Opc);

711if (Info &&AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)

712 Result.SSamp =true;

713

714return Result;

715 }

716if (TII.isMTBUF(Opc)) {

717if (AMDGPU::getMTBUFHasVAddr(Opc))

718 Result.VAddr =true;

719if (AMDGPU::getMTBUFHasSrsrc(Opc))

720 Result.SRsrc =true;

721if (AMDGPU::getMTBUFHasSoffset(Opc))

722 Result.SOffset =true;

723

724return Result;

725 }

726

727switch (Opc) {

728default:

729return Result;

730case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:

731case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:

732case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:

733case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:

734case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:

735case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:

736case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:

737case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:

738case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:

739 Result.SOffset =true;

740 [[fallthrough]];

741case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:

742case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:

743case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:

744case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:

745case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:

746case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:

747case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:

748case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:

749case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:

750case AMDGPU::S_LOAD_DWORD_IMM:

751case AMDGPU::S_LOAD_DWORDX2_IMM:

752case AMDGPU::S_LOAD_DWORDX3_IMM:

753case AMDGPU::S_LOAD_DWORDX4_IMM:

754case AMDGPU::S_LOAD_DWORDX8_IMM:

755case AMDGPU::S_LOAD_DWORDX2_IMM_ec:

756case AMDGPU::S_LOAD_DWORDX3_IMM_ec:

757case AMDGPU::S_LOAD_DWORDX4_IMM_ec:

758case AMDGPU::S_LOAD_DWORDX8_IMM_ec:

759 Result.SBase =true;

760return Result;

761case AMDGPU::DS_READ_B32:

762case AMDGPU::DS_READ_B64:

763case AMDGPU::DS_READ_B32_gfx9:

764case AMDGPU::DS_READ_B64_gfx9:

765case AMDGPU::DS_WRITE_B32:

766case AMDGPU::DS_WRITE_B64:

767case AMDGPU::DS_WRITE_B32_gfx9:

768case AMDGPU::DS_WRITE_B64_gfx9:

769 Result.Addr =true;

770return Result;

771case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:

772case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:

773case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:

774case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:

775case AMDGPU::GLOBAL_STORE_DWORD_SADDR:

776case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:

777case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:

778case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:

779 Result.SAddr =true;

780 [[fallthrough]];

781case AMDGPU::GLOBAL_LOAD_DWORD:

782case AMDGPU::GLOBAL_LOAD_DWORDX2:

783case AMDGPU::GLOBAL_LOAD_DWORDX3:

784case AMDGPU::GLOBAL_LOAD_DWORDX4:

785case AMDGPU::GLOBAL_STORE_DWORD:

786case AMDGPU::GLOBAL_STORE_DWORDX2:

787case AMDGPU::GLOBAL_STORE_DWORDX3:

788case AMDGPU::GLOBAL_STORE_DWORDX4:

789case AMDGPU::FLAT_LOAD_DWORD:

790case AMDGPU::FLAT_LOAD_DWORDX2:

791case AMDGPU::FLAT_LOAD_DWORDX3:

792case AMDGPU::FLAT_LOAD_DWORDX4:

793case AMDGPU::FLAT_STORE_DWORD:

794case AMDGPU::FLAT_STORE_DWORDX2:

795case AMDGPU::FLAT_STORE_DWORDX3:

796case AMDGPU::FLAT_STORE_DWORDX4:

797 Result.VAddr =true;

798return Result;

799 }

800}

801

802void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,

803const SILoadStoreOptimizer &LSO) {

804I =MI;

805unsigned Opc =MI->getOpcode();

806 InstClass = getInstClass(Opc, *LSO.TII);

807

808if (InstClass == UNKNOWN)

809return;

810

811 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

812

813switch (InstClass) {

814case DS_READ:

815 EltSize =

816 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8

817 : 4;

818break;

819case DS_WRITE:

820 EltSize =

821 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8

822 : 4;

823break;

824case S_BUFFER_LOAD_IMM:

825case S_BUFFER_LOAD_SGPR_IMM:

826case S_LOAD_IMM:

827 EltSize =AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);

828break;

829default:

830 EltSize = 4;

831break;

832 }

833

834if (InstClass == MIMG) {

835 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();

836// Offset is not considered for MIMG instructions.

837Offset = 0;

838 }else {

839int OffsetIdx =AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);

840Offset =I->getOperand(OffsetIdx).getImm();

841 }

842

843if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)

844 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

845

846 Width = getOpcodeWidth(*I, *LSO.TII);

847

848if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {

849Offset &= 0xffff;

850 }elseif (InstClass != MIMG) {

851 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

852 }

853

854 AddressRegs Regs = getRegs(Opc, *LSO.TII);

855bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

856

857 NumAddresses = 0;

858for (unsigned J = 0; J < Regs.NumVAddrs; J++)

859 AddrIdx[NumAddresses++] =

860AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;

861if (Regs.Addr)

862 AddrIdx[NumAddresses++] =

863AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);

864if (Regs.SBase)

865 AddrIdx[NumAddresses++] =

866AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);

867if (Regs.SRsrc)

868 AddrIdx[NumAddresses++] =AMDGPU::getNamedOperandIdx(

869 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);

870if (Regs.SOffset)

871 AddrIdx[NumAddresses++] =

872AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);

873if (Regs.SAddr)

874 AddrIdx[NumAddresses++] =

875AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);

876if (Regs.VAddr)

877 AddrIdx[NumAddresses++] =

878AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);

879if (Regs.SSamp)

880 AddrIdx[NumAddresses++] =AMDGPU::getNamedOperandIdx(

881 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);

882assert(NumAddresses <= MaxAddressRegs);

883

884for (unsigned J = 0; J < NumAddresses; J++)

885 AddrReg[J] = &I->getOperand(AddrIdx[J]);

886}

887

888}// end anonymous namespace.

889

890INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy,DEBUG_TYPE,

891"SI Load Store Optimizer",false,false)

892INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)

893INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy,DEBUG_TYPE,

894 "SI Load StoreOptimizer",false,false)

895

896char SILoadStoreOptimizerLegacy::ID = 0;

897

898char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;

899

900FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {

901returnnew SILoadStoreOptimizerLegacy();

902}

903

904staticvoidaddDefsUsesToList(constMachineInstr &MI,

905DenseSet<Register> &RegDefs,

906DenseSet<Register> &RegUses) {

907for (constauto &Op :MI.operands()) {

908if (!Op.isReg())

909continue;

910if (Op.isDef())

911 RegDefs.insert(Op.getReg());

912if (Op.readsReg())

913 RegUses.insert(Op.getReg());

914 }

915}

916

917bool SILoadStoreOptimizer::canSwapInstructions(

918constDenseSet<Register> &ARegDefs,constDenseSet<Register> &ARegUses,

919constMachineInstr &A,constMachineInstr &B) const{

920if (A.mayLoadOrStore() &&B.mayLoadOrStore() &&

921 (A.mayStore() ||B.mayStore()) &&A.mayAlias(AA,B,true))

922returnfalse;

923for (constauto &BOp :B.operands()) {

924if (!BOp.isReg())

925continue;

926if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))

927returnfalse;

928if (BOp.isDef() && ARegUses.contains(BOp.getReg()))

929returnfalse;

930 }

931returntrue;

932}

933

934// Given that \p CI and \p Paired are adjacent memory operations produce a new

935// MMO for the combined operation with a new access size.

936MachineMemOperand *

937SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,

938const CombineInfo &Paired) {

939constMachineMemOperand *MMOa = *CI.I->memoperands_begin();

940constMachineMemOperand *MMOb = *Paired.I->memoperands_begin();

941

942unsignedSize = MMOa->getSize().getValue() + MMOb->getSize().getValue();

943

944// A base pointer for the combined operation is the same as the leading

945// operation's pointer.

946if (Paired < CI)

947std::swap(MMOa, MMOb);

948

949MachinePointerInfo PtrInfo(MMOa->getPointerInfo());

950// If merging FLAT and GLOBAL set address space to FLAT.

951if (MMOb->getAddrSpace() ==AMDGPUAS::FLAT_ADDRESS)

952 PtrInfo.AddrSpace =AMDGPUAS::FLAT_ADDRESS;

953

954MachineFunction *MF = CI.I->getMF();

955return MF->getMachineMemOperand(MMOa, PtrInfo,Size);

956}

957

958bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,

959constSIInstrInfo &TII,

960const CombineInfo &Paired) {

961assert(CI.InstClass == MIMG);

962

963// Ignore instructions with tfe/lwe set.

964constauto *TFEOp =TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);

965constauto *LWEOp =TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

966

967if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

968returnfalse;

969

970// Check other optional immediate operands for equality.

971unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,

972 AMDGPU::OpName::unorm, AMDGPU::OpName::da,

973 AMDGPU::OpName::r128, AMDGPU::OpName::a16};

974

975for (autoop : OperandsToMatch) {

976intIdx =AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),op);

977if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(),op) !=Idx)

978returnfalse;

979if (Idx != -1 &&

980 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

981returnfalse;

982 }

983

984// Check DMask for overlaps.

985unsigned MaxMask = std::max(CI.DMask, Paired.DMask);

986unsigned MinMask = std::min(CI.DMask, Paired.DMask);

987

988if (!MaxMask)

989returnfalse;

990

991unsigned AllowedBitsForMin =llvm::countr_zero(MaxMask);

992if ((1u << AllowedBitsForMin) <= MinMask)

993returnfalse;

994

995returntrue;

996}

997

998staticunsignedgetBufferFormatWithCompCount(unsigned OldFormat,

999unsigned ComponentCount,

1000constGCNSubtarget &STI) {

1001if (ComponentCount > 4)

1002return 0;

1003

1004constllvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =

1005llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);

1006if (!OldFormatInfo)

1007return 0;

1008

1009constllvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =

1010llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,

1011 ComponentCount,

1012 OldFormatInfo->NumFormat, STI);

1013

1014if (!NewFormatInfo)

1015return 0;

1016

1017assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&

1018 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);

1019

1020return NewFormatInfo->Format;

1021}

1022

1023// Return the value in the inclusive range [Lo,Hi] that is aligned to the

1024// highest power of two. Note that the result is well defined for all inputs

1025// including corner cases like:

1026// - if Lo == Hi, return that value

1027// - if Lo == 0, return 0 (even though the "- 1" below underflows

1028// - if Lo > Hi, return 0 (as if the range wrapped around)

1029staticuint32_t mostAlignedValueInRange(uint32_t Lo,uint32_t Hi) {

1030returnHi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^Hi) + 1);

1031}

1032

1033bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,

1034constGCNSubtarget &STI,

1035 CombineInfo &Paired,

1036bool Modify) {

1037assert(CI.InstClass != MIMG);

1038

1039// XXX - Would the same offset be OK? Is there any reason this would happen or

1040// be useful?

1041if (CI.Offset == Paired.Offset)

1042returnfalse;

1043

1044// This won't be valid if the offset isn't aligned.

1045if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

1046returnfalse;

1047

1048if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {

1049

1050constllvm::AMDGPU::GcnBufferFormatInfo *Info0 =

1051llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);

1052if (!Info0)

1053returnfalse;

1054constllvm::AMDGPU::GcnBufferFormatInfo *Info1 =

1055llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);

1056if (!Info1)

1057returnfalse;

1058

1059if (Info0->BitsPerComp != Info1->BitsPerComp ||

1060 Info0->NumFormat != Info1->NumFormat)

1061returnfalse;

1062

1063// TODO: Should be possible to support more formats, but if format loads

1064// are not dword-aligned, the merged load might not be valid.

1065if (Info0->BitsPerComp != 32)

1066returnfalse;

1067

1068if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)

1069returnfalse;

1070 }

1071

1072uint32_t EltOffset0 = CI.Offset / CI.EltSize;

1073uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

1074 CI.UseST64 =false;

1075 CI.BaseOff = 0;

1076

1077// Handle all non-DS instructions.

1078if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {

1079if (EltOffset0 + CI.Width != EltOffset1 &&

1080 EltOffset1 + Paired.Width != EltOffset0)

1081returnfalse;

1082if (CI.CPol != Paired.CPol)

1083returnfalse;

1084if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||

1085 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {

1086// Reject cases like:

1087// dword + dwordx2 -> dwordx3

1088// dword + dwordx3 -> dwordx4

1089// If we tried to combine these cases, we would fail to extract a subreg

1090// for the result of the second load due to SGPR alignment requirements.

1091if (CI.Width != Paired.Width &&

1092 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

1093returnfalse;

1094 }

1095returntrue;

1096 }

1097

1098// If the offset in elements doesn't fit in 8-bits, we might be able to use

1099// the stride 64 versions.

1100if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&

1101 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {

1102if (Modify) {

1103 CI.Offset = EltOffset0 / 64;

1104 Paired.Offset = EltOffset1 / 64;

1105 CI.UseST64 =true;

1106 }

1107returntrue;

1108 }

1109

1110// Check if the new offsets fit in the reduced 8-bit range.

1111if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {

1112if (Modify) {

1113 CI.Offset = EltOffset0;

1114 Paired.Offset = EltOffset1;

1115 }

1116returntrue;

1117 }

1118

1119// Try to shift base address to decrease offsets.

1120uint32_t Min = std::min(EltOffset0, EltOffset1);

1121uint32_t Max = std::max(EltOffset0, EltOffset1);

1122

1123constuint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;

1124if (((Max - Min) & ~Mask) == 0) {

1125if (Modify) {

1126// From the range of values we could use for BaseOff, choose the one that

1127// is aligned to the highest power of two, to maximise the chance that

1128// the same offset can be reused for other load/store pairs.

1129uint32_t BaseOff =mostAlignedValueInRange(Max - 0xff * 64, Min);

1130// Copy the low bits of the offsets, so that when we adjust them by

1131// subtracting BaseOff they will be multiples of 64.

1132 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);

1133 CI.BaseOff = BaseOff * CI.EltSize;

1134 CI.Offset = (EltOffset0 - BaseOff) / 64;

1135 Paired.Offset = (EltOffset1 - BaseOff) / 64;

1136 CI.UseST64 =true;

1137 }

1138returntrue;

1139 }

1140

1141if (isUInt<8>(Max - Min)) {

1142if (Modify) {

1143// From the range of values we could use for BaseOff, choose the one that

1144// is aligned to the highest power of two, to maximise the chance that

1145// the same offset can be reused for other load/store pairs.

1146uint32_t BaseOff =mostAlignedValueInRange(Max - 0xff, Min);

1147 CI.BaseOff = BaseOff * CI.EltSize;

1148 CI.Offset = EltOffset0 - BaseOff;

1149 Paired.Offset = EltOffset1 - BaseOff;

1150 }

1151returntrue;

1152 }

1153

1154returnfalse;

1155}

1156

1157bool SILoadStoreOptimizer::widthsFit(constGCNSubtarget &STM,

1158const CombineInfo &CI,

1159const CombineInfo &Paired) {

1160constunsigned Width = (CI.Width + Paired.Width);

1161switch (CI.InstClass) {

1162default:

1163return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));

1164case S_BUFFER_LOAD_IMM:

1165case S_BUFFER_LOAD_SGPR_IMM:

1166case S_LOAD_IMM:

1167switch (Width) {

1168default:

1169returnfalse;

1170case 2:

1171case 4:

1172case 8:

1173returntrue;

1174case 3:

1175return STM.hasScalarDwordx3Loads();

1176 }

1177 }

1178}

1179

1180constTargetRegisterClass *

1181SILoadStoreOptimizer::getDataRegClass(constMachineInstr &MI) const{

1182if (constauto *Dst =TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {

1183returnTRI->getRegClassForReg(*MRI, Dst->getReg());

1184 }

1185if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {

1186returnTRI->getRegClassForReg(*MRI, Src->getReg());

1187 }

1188if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {

1189returnTRI->getRegClassForReg(*MRI, Src->getReg());

1190 }

1191if (constauto *Dst =TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {

1192returnTRI->getRegClassForReg(*MRI, Dst->getReg());

1193 }

1194if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {

1195returnTRI->getRegClassForReg(*MRI, Src->getReg());

1196 }

1197returnnullptr;

1198}

1199

1200/// This function assumes that CI comes before Paired in a basic block. Return

1201/// an insertion point for the merged instruction or nullptr on failure.

1202SILoadStoreOptimizer::CombineInfo *

1203SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,

1204 CombineInfo &Paired) {

1205// If another instruction has already been merged into CI, it may now be a

1206// type that we can't do any further merging into.

1207if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

1208returnnullptr;

1209assert(CI.InstClass == Paired.InstClass);

1210

1211if (getInstSubclass(CI.I->getOpcode(), *TII) !=

1212 getInstSubclass(Paired.I->getOpcode(), *TII))

1213returnnullptr;

1214

1215// Check both offsets (or masks for MIMG) can be combined and fit in the

1216// reduced range.

1217if (CI.InstClass == MIMG) {

1218if (!dmasksCanBeCombined(CI, *TII, Paired))

1219returnnullptr;

1220 }else {

1221if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

1222returnnullptr;

1223 }

1224

1225DenseSet<Register> RegDefs;

1226DenseSet<Register> RegUses;

1227 CombineInfo *Where;

1228if (CI.I->mayLoad()) {

1229// Try to hoist Paired up to CI.

1230addDefsUsesToList(*Paired.I, RegDefs, RegUses);

1231for (MachineBasicBlock::iterator MBBI = Paired.I; --MBBI != CI.I;) {

1232if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

1233returnnullptr;

1234 }

1235 Where = &CI;

1236 }else {

1237// Try to sink CI down to Paired.

1238addDefsUsesToList(*CI.I, RegDefs, RegUses);

1239for (MachineBasicBlock::iterator MBBI = CI.I; ++MBBI != Paired.I;) {

1240if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

1241returnnullptr;

1242 }

1243 Where = &Paired;

1244 }

1245

1246// Call offsetsCanBeCombined with modify = true so that the offsets are

1247// correct for the new instruction. This should return true, because

1248// this function should only be called on CombineInfo objects that

1249// have already been confirmed to be mergeable.

1250if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)

1251 offsetsCanBeCombined(CI, *STM, Paired,true);

1252return Where;

1253}

1254

1255// Copy the merged load result from DestReg to the original dest regs of CI and

1256// Paired.

1257void SILoadStoreOptimizer::copyToDestRegs(

1258 CombineInfo &CI, CombineInfo &Paired,

1259MachineBasicBlock::iterator InsertBefore,intOpName,

1260Register DestReg) const{

1261MachineBasicBlock *MBB = CI.I->getParent();

1262DebugLoc DL = CI.I->getDebugLoc();

1263

1264auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

1265

1266// Copy to the old destination registers.

1267constMCInstrDesc &CopyDesc =TII->get(TargetOpcode::COPY);

1268auto *Dest0 =TII->getNamedOperand(*CI.I,OpName);

1269auto *Dest1 =TII->getNamedOperand(*Paired.I,OpName);

1270

1271// The constrained sload instructions in S_LOAD_IMM class will have

1272// `early-clobber` flag in the dst operand. Remove the flag before using the

1273// MOs in copies.

1274 Dest0->setIsEarlyClobber(false);

1275 Dest1->setIsEarlyClobber(false);

1276

1277BuildMI(*MBB, InsertBefore,DL, CopyDesc)

1278 .add(*Dest0)// Copy to same destination including flags and sub reg.

1279 .addReg(DestReg, 0, SubRegIdx0);

1280BuildMI(*MBB, InsertBefore,DL, CopyDesc)

1281 .add(*Dest1)

1282 .addReg(DestReg,RegState::Kill, SubRegIdx1);

1283}

1284

1285// Return a register for the source of the merged store after copying the

1286// original source regs of CI and Paired into it.

1287Register

1288SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

1289MachineBasicBlock::iterator InsertBefore,

1290intOpName) const{

1291MachineBasicBlock *MBB = CI.I->getParent();

1292DebugLoc DL = CI.I->getDebugLoc();

1293

1294auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

1295

1296// Copy to the new source register.

1297constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1298Register SrcReg =MRI->createVirtualRegister(SuperRC);

1299

1300constauto *Src0 =TII->getNamedOperand(*CI.I,OpName);

1301constauto *Src1 =TII->getNamedOperand(*Paired.I,OpName);

1302

1303BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::REG_SEQUENCE), SrcReg)

1304 .add(*Src0)

1305 .addImm(SubRegIdx0)

1306 .add(*Src1)

1307 .addImm(SubRegIdx1);

1308

1309return SrcReg;

1310}

1311

1312unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const{

1313if (STM->ldsRequiresM0Init())

1314return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;

1315return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;

1316}

1317

1318unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const{

1319if (STM->ldsRequiresM0Init())

1320return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;

1321

1322return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9

1323 : AMDGPU::DS_READ2ST64_B64_gfx9;

1324}

1325

1326MachineBasicBlock::iterator

1327SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,

1328MachineBasicBlock::iterator InsertBefore) {

1329MachineBasicBlock *MBB = CI.I->getParent();

1330

1331// Be careful, since the addresses could be subregisters themselves in weird

1332// cases, like vectors of pointers.

1333constauto *AddrReg =TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

1334

1335unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);

1336unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);

1337unsigned Opc =

1338 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

1339

1340assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&

1341 (NewOffset0 != NewOffset1) &&"Computed offset doesn't fit");

1342

1343constMCInstrDesc &Read2Desc =TII->get(Opc);

1344

1345constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1346Register DestReg =MRI->createVirtualRegister(SuperRC);

1347

1348DebugLoc DL = CI.I->getDebugLoc();

1349

1350Register BaseReg = AddrReg->getReg();

1351unsigned BaseSubReg = AddrReg->getSubReg();

1352unsigned BaseRegFlags = 0;

1353if (CI.BaseOff) {

1354Register ImmReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

1355BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::S_MOV_B32), ImmReg)

1356 .addImm(CI.BaseOff);

1357

1358 BaseReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

1359 BaseRegFlags =RegState::Kill;

1360

1361TII->getAddNoCarry(*MBB, InsertBefore,DL, BaseReg)

1362 .addReg(ImmReg)

1363 .addReg(AddrReg->getReg(), 0, BaseSubReg)

1364 .addImm(0);// clamp bit

1365 BaseSubReg = 0;

1366 }

1367

1368MachineInstrBuilder Read2 =

1369BuildMI(*MBB, InsertBefore,DL, Read2Desc, DestReg)

1370 .addReg(BaseReg, BaseRegFlags, BaseSubReg)// addr

1371 .addImm(NewOffset0)// offset0

1372 .addImm(NewOffset1)// offset1

1373 .addImm(0)// gds

1374 .cloneMergedMemRefs({&*CI.I, &*Paired.I});

1375

1376 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

1377

1378 CI.I->eraseFromParent();

1379 Paired.I->eraseFromParent();

1380

1381LLVM_DEBUG(dbgs() <<"Inserted read2: " << *Read2 <<'\n');

1382return Read2;

1383}

1384

1385unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const{

1386if (STM->ldsRequiresM0Init())

1387return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;

1388return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9

1389 : AMDGPU::DS_WRITE2_B64_gfx9;

1390}

1391

1392unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const{

1393if (STM->ldsRequiresM0Init())

1394return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32

1395 : AMDGPU::DS_WRITE2ST64_B64;

1396

1397return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9

1398 : AMDGPU::DS_WRITE2ST64_B64_gfx9;

1399}

1400

1401MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(

1402 CombineInfo &CI, CombineInfo &Paired,

1403MachineBasicBlock::iterator InsertBefore) {

1404MachineBasicBlock *MBB = CI.I->getParent();

1405

1406// Be sure to use .addOperand(), and not .addReg() with these. We want to be

1407// sure we preserve the subregister index and any register flags set on them.

1408constMachineOperand *AddrReg =

1409TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

1410constMachineOperand *Data0 =

1411TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);

1412constMachineOperand *Data1 =

1413TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

1414

1415unsigned NewOffset0 = CI.Offset;

1416unsigned NewOffset1 = Paired.Offset;

1417unsigned Opc =

1418 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

1419

1420if (NewOffset0 > NewOffset1) {

1421// Canonicalize the merged instruction so the smaller offset comes first.

1422std::swap(NewOffset0, NewOffset1);

1423std::swap(Data0, Data1);

1424 }

1425

1426assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&

1427 (NewOffset0 != NewOffset1) &&"Computed offset doesn't fit");

1428

1429constMCInstrDesc &Write2Desc =TII->get(Opc);

1430DebugLoc DL = CI.I->getDebugLoc();

1431

1432Register BaseReg = AddrReg->getReg();

1433unsigned BaseSubReg = AddrReg->getSubReg();

1434unsigned BaseRegFlags = 0;

1435if (CI.BaseOff) {

1436Register ImmReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

1437BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::S_MOV_B32), ImmReg)

1438 .addImm(CI.BaseOff);

1439

1440 BaseReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

1441 BaseRegFlags =RegState::Kill;

1442

1443TII->getAddNoCarry(*MBB, InsertBefore,DL, BaseReg)

1444 .addReg(ImmReg)

1445 .addReg(AddrReg->getReg(), 0, BaseSubReg)

1446 .addImm(0);// clamp bit

1447 BaseSubReg = 0;

1448 }

1449

1450MachineInstrBuilder Write2 =

1451BuildMI(*MBB, InsertBefore,DL, Write2Desc)

1452 .addReg(BaseReg, BaseRegFlags, BaseSubReg)// addr

1453 .add(*Data0)// data0

1454 .add(*Data1)// data1

1455 .addImm(NewOffset0)// offset0

1456 .addImm(NewOffset1)// offset1

1457 .addImm(0)// gds

1458 .cloneMergedMemRefs({&*CI.I, &*Paired.I});

1459

1460 CI.I->eraseFromParent();

1461 Paired.I->eraseFromParent();

1462

1463LLVM_DEBUG(dbgs() <<"Inserted write2 inst: " << *Write2 <<'\n');

1464return Write2;

1465}

1466

1467MachineBasicBlock::iterator

1468SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,

1469MachineBasicBlock::iterator InsertBefore) {

1470MachineBasicBlock *MBB = CI.I->getParent();

1471DebugLoc DL = CI.I->getDebugLoc();

1472constunsigned Opcode = getNewOpcode(CI, Paired);

1473

1474constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1475

1476Register DestReg =MRI->createVirtualRegister(SuperRC);

1477unsigned MergedDMask = CI.DMask | Paired.DMask;

1478unsigned DMaskIdx =

1479AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);

1480

1481auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);

1482for (unsignedI = 1, E = (*CI.I).getNumOperands();I != E; ++I) {

1483if (I == DMaskIdx)

1484 MIB.addImm(MergedDMask);

1485else

1486 MIB.add((*CI.I).getOperand(I));

1487 }

1488

1489// It shouldn't be possible to get this far if the two instructions

1490// don't have a single memoperand, because MachineInstr::mayAlias()

1491// will return true if this is the case.

1492assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1493

1494MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1495

1496 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

1497

1498 CI.I->eraseFromParent();

1499 Paired.I->eraseFromParent();

1500returnNew;

1501}

1502

1503MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(

1504 CombineInfo &CI, CombineInfo &Paired,

1505MachineBasicBlock::iterator InsertBefore) {

1506MachineBasicBlock *MBB = CI.I->getParent();

1507DebugLoc DL = CI.I->getDebugLoc();

1508constunsigned Opcode = getNewOpcode(CI, Paired);

1509

1510constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1511

1512Register DestReg =MRI->createVirtualRegister(SuperRC);

1513unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1514

1515// It shouldn't be possible to get this far if the two instructions

1516// don't have a single memoperand, because MachineInstr::mayAlias()

1517// will return true if this is the case.

1518assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1519

1520MachineInstrBuilder New =

1521BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg)

1522 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));

1523if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)

1524New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));

1525New.addImm(MergedOffset);

1526New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1527

1528 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

1529

1530 CI.I->eraseFromParent();

1531 Paired.I->eraseFromParent();

1532returnNew;

1533}

1534

1535MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(

1536 CombineInfo &CI, CombineInfo &Paired,

1537MachineBasicBlock::iterator InsertBefore) {

1538MachineBasicBlock *MBB = CI.I->getParent();

1539DebugLoc DL = CI.I->getDebugLoc();

1540

1541constunsigned Opcode = getNewOpcode(CI, Paired);

1542

1543constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1544

1545// Copy to the new source register.

1546Register DestReg =MRI->createVirtualRegister(SuperRC);

1547unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1548

1549auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);

1550

1551 AddressRegs Regs = getRegs(Opcode, *TII);

1552

1553if (Regs.VAddr)

1554 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1555

1556// It shouldn't be possible to get this far if the two instructions

1557// don't have a single memoperand, because MachineInstr::mayAlias()

1558// will return true if this is the case.

1559assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1560

1561MachineInstr *New =

1562 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1563 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1564 .addImm(MergedOffset)// offset

1565 .addImm(CI.CPol)// cpol

1566 .addImm(0)// swz

1567 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1568

1569 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

1570

1571 CI.I->eraseFromParent();

1572 Paired.I->eraseFromParent();

1573returnNew;

1574}

1575

1576MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(

1577 CombineInfo &CI, CombineInfo &Paired,

1578MachineBasicBlock::iterator InsertBefore) {

1579MachineBasicBlock *MBB = CI.I->getParent();

1580DebugLoc DL = CI.I->getDebugLoc();

1581

1582constunsigned Opcode = getNewOpcode(CI, Paired);

1583

1584constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1585

1586// Copy to the new source register.

1587Register DestReg =MRI->createVirtualRegister(SuperRC);

1588unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

1589

1590auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);

1591

1592 AddressRegs Regs = getRegs(Opcode, *TII);

1593

1594if (Regs.VAddr)

1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1596

1597unsigned JoinedFormat =

1598getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

1599

1600// It shouldn't be possible to get this far if the two instructions

1601// don't have a single memoperand, because MachineInstr::mayAlias()

1602// will return true if this is the case.

1603assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1604

1605MachineInstr *New =

1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1607 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1608 .addImm(MergedOffset)// offset

1609 .addImm(JoinedFormat)// format

1610 .addImm(CI.CPol)// cpol

1611 .addImm(0)// swz

1612 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1613

1614 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

1615

1616 CI.I->eraseFromParent();

1617 Paired.I->eraseFromParent();

1618returnNew;

1619}

1620

1621MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(

1622 CombineInfo &CI, CombineInfo &Paired,

1623MachineBasicBlock::iterator InsertBefore) {

1624MachineBasicBlock *MBB = CI.I->getParent();

1625DebugLoc DL = CI.I->getDebugLoc();

1626

1627constunsigned Opcode = getNewOpcode(CI, Paired);

1628

1629Register SrcReg =

1630 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

1631

1632auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))

1633 .addReg(SrcReg,RegState::Kill);

1634

1635 AddressRegs Regs = getRegs(Opcode, *TII);

1636

1637if (Regs.VAddr)

1638 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1639

1640unsigned JoinedFormat =

1641getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);

1642

1643// It shouldn't be possible to get this far if the two instructions

1644// don't have a single memoperand, because MachineInstr::mayAlias()

1645// will return true if this is the case.

1646assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1647

1648MachineInstr *New =

1649 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1650 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1651 .addImm(std::min(CI.Offset, Paired.Offset))// offset

1652 .addImm(JoinedFormat)// format

1653 .addImm(CI.CPol)// cpol

1654 .addImm(0)// swz

1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1656

1657 CI.I->eraseFromParent();

1658 Paired.I->eraseFromParent();

1659returnNew;

1660}

1661

1662MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(

1663 CombineInfo &CI, CombineInfo &Paired,

1664MachineBasicBlock::iterator InsertBefore) {

1665MachineBasicBlock *MBB = CI.I->getParent();

1666DebugLoc DL = CI.I->getDebugLoc();

1667

1668constunsigned Opcode = getNewOpcode(CI, Paired);

1669

1670constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);

1671Register DestReg =MRI->createVirtualRegister(SuperRC);

1672

1673auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);

1674

1675if (auto *SAddr =TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

1676 MIB.add(*SAddr);

1677

1678MachineInstr *New =

1679 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

1680 .addImm(std::min(CI.Offset, Paired.Offset))

1681 .addImm(CI.CPol)

1682 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1683

1684 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

1685

1686 CI.I->eraseFromParent();

1687 Paired.I->eraseFromParent();

1688returnNew;

1689}

1690

1691MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(

1692 CombineInfo &CI, CombineInfo &Paired,

1693MachineBasicBlock::iterator InsertBefore) {

1694MachineBasicBlock *MBB = CI.I->getParent();

1695DebugLoc DL = CI.I->getDebugLoc();

1696

1697constunsigned Opcode = getNewOpcode(CI, Paired);

1698

1699Register SrcReg =

1700 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

1701

1702auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))

1703 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))

1704 .addReg(SrcReg,RegState::Kill);

1705

1706if (auto *SAddr =TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

1707 MIB.add(*SAddr);

1708

1709MachineInstr *New =

1710 MIB.addImm(std::min(CI.Offset, Paired.Offset))

1711 .addImm(CI.CPol)

1712 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1713

1714 CI.I->eraseFromParent();

1715 Paired.I->eraseFromParent();

1716returnNew;

1717}

1718

1719staticboolneedsConstrainedOpcode(constGCNSubtarget &STM,

1720ArrayRef<MachineMemOperand *> MMOs,

1721unsigned Width) {

1722// Conservatively returns true if not found the MMO.

1723return STM.isXNACKEnabled() &&

1724 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);

1725}

1726

1727unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,

1728const CombineInfo &Paired) {

1729constunsigned Width = CI.Width + Paired.Width;

1730

1731switch (getCommonInstClass(CI, Paired)) {

1732default:

1733assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);

1734// FIXME: Handle d16 correctly

1735returnAMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),

1736 Width);

1737case TBUFFER_LOAD:

1738case TBUFFER_STORE:

1739returnAMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),

1740 Width);

1741

1742caseUNKNOWN:

1743llvm_unreachable("Unknown instruction class");

1744case S_BUFFER_LOAD_IMM: {

1745// If XNACK is enabled, use the constrained opcodes when the first load is

1746// under-aligned.

1747bool NeedsConstrainedOpc =

1748needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1749switch (Width) {

1750default:

1751return 0;

1752case 2:

1753return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec

1754 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;

1755case 3:

1756return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec

1757 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;

1758case 4:

1759return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec

1760 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;

1761case 8:

1762return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec

1763 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;

1764 }

1765 }

1766case S_BUFFER_LOAD_SGPR_IMM: {

1767// If XNACK is enabled, use the constrained opcodes when the first load is

1768// under-aligned.

1769bool NeedsConstrainedOpc =

1770needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1771switch (Width) {

1772default:

1773return 0;

1774case 2:

1775return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec

1776 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;

1777case 3:

1778return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec

1779 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;

1780case 4:

1781return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec

1782 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;

1783case 8:

1784return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec

1785 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;

1786 }

1787 }

1788case S_LOAD_IMM: {

1789// If XNACK is enabled, use the constrained opcodes when the first load is

1790// under-aligned.

1791bool NeedsConstrainedOpc =

1792needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);

1793switch (Width) {

1794default:

1795return 0;

1796case 2:

1797return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec

1798 : AMDGPU::S_LOAD_DWORDX2_IMM;

1799case 3:

1800return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec

1801 : AMDGPU::S_LOAD_DWORDX3_IMM;

1802case 4:

1803return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec

1804 : AMDGPU::S_LOAD_DWORDX4_IMM;

1805case 8:

1806return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec

1807 : AMDGPU::S_LOAD_DWORDX8_IMM;

1808 }

1809 }

1810case GLOBAL_LOAD:

1811switch (Width) {

1812default:

1813return 0;

1814case 2:

1815return AMDGPU::GLOBAL_LOAD_DWORDX2;

1816case 3:

1817return AMDGPU::GLOBAL_LOAD_DWORDX3;

1818case 4:

1819return AMDGPU::GLOBAL_LOAD_DWORDX4;

1820 }

1821case GLOBAL_LOAD_SADDR:

1822switch (Width) {

1823default:

1824return 0;

1825case 2:

1826return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;

1827case 3:

1828return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;

1829case 4:

1830return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;

1831 }

1832case GLOBAL_STORE:

1833switch (Width) {

1834default:

1835return 0;

1836case 2:

1837return AMDGPU::GLOBAL_STORE_DWORDX2;

1838case 3:

1839return AMDGPU::GLOBAL_STORE_DWORDX3;

1840case 4:

1841return AMDGPU::GLOBAL_STORE_DWORDX4;

1842 }

1843case GLOBAL_STORE_SADDR:

1844switch (Width) {

1845default:

1846return 0;

1847case 2:

1848return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;

1849case 3:

1850return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;

1851case 4:

1852return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;

1853 }

1854case FLAT_LOAD:

1855switch (Width) {

1856default:

1857return 0;

1858case 2:

1859return AMDGPU::FLAT_LOAD_DWORDX2;

1860case 3:

1861return AMDGPU::FLAT_LOAD_DWORDX3;

1862case 4:

1863return AMDGPU::FLAT_LOAD_DWORDX4;

1864 }

1865case FLAT_STORE:

1866switch (Width) {

1867default:

1868return 0;

1869case 2:

1870return AMDGPU::FLAT_STORE_DWORDX2;

1871case 3:

1872return AMDGPU::FLAT_STORE_DWORDX3;

1873case 4:

1874return AMDGPU::FLAT_STORE_DWORDX4;

1875 }

1876caseMIMG:

1877assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&

1878"No overlaps");

1879returnAMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);

1880 }

1881}

1882

1883std::pair<unsigned, unsigned>

1884SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,

1885const CombineInfo &Paired) {

1886assert((CI.InstClass != MIMG ||

1887 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==

1888 CI.Width + Paired.Width)) &&

1889"No overlaps");

1890

1891unsigned Idx0;

1892unsigned Idx1;

1893

1894staticconstunsigned Idxs[5][4] = {

1895 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},

1896 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},

1897 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},

1898 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},

1899 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},

1900 };

1901

1902assert(CI.Width >= 1 && CI.Width <= 4);

1903assert(Paired.Width >= 1 && Paired.Width <= 4);

1904

1905if (Paired < CI) {

1906 Idx1 = Idxs[0][Paired.Width - 1];

1907 Idx0 = Idxs[Paired.Width][CI.Width - 1];

1908 }else {

1909 Idx0 = Idxs[0][CI.Width - 1];

1910 Idx1 = Idxs[CI.Width][Paired.Width - 1];

1911 }

1912

1913return {Idx0, Idx1};

1914}

1915

1916constTargetRegisterClass *

1917SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,

1918const CombineInfo &Paired) const{

1919if (CI.InstClass == S_BUFFER_LOAD_IMM ||

1920 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {

1921switch (CI.Width + Paired.Width) {

1922default:

1923returnnullptr;

1924case 2:

1925return &AMDGPU::SReg_64_XEXECRegClass;

1926case 3:

1927return &AMDGPU::SGPR_96RegClass;

1928case 4:

1929return &AMDGPU::SGPR_128RegClass;

1930case 8:

1931return &AMDGPU::SGPR_256RegClass;

1932case 16:

1933return &AMDGPU::SGPR_512RegClass;

1934 }

1935 }

1936

1937unsignedBitWidth = 32 * (CI.Width + Paired.Width);

1938returnTRI->isAGPRClass(getDataRegClass(*CI.I))

1939 ?TRI->getAGPRClassForBitWidth(BitWidth)

1940 :TRI->getVGPRClassForBitWidth(BitWidth);

1941}

1942

1943MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(

1944 CombineInfo &CI, CombineInfo &Paired,

1945MachineBasicBlock::iterator InsertBefore) {

1946MachineBasicBlock *MBB = CI.I->getParent();

1947DebugLoc DL = CI.I->getDebugLoc();

1948

1949constunsigned Opcode = getNewOpcode(CI, Paired);

1950

1951Register SrcReg =

1952 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

1953

1954auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))

1955 .addReg(SrcReg,RegState::Kill);

1956

1957 AddressRegs Regs = getRegs(Opcode, *TII);

1958

1959if (Regs.VAddr)

1960 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

1961

1962

1963// It shouldn't be possible to get this far if the two instructions

1964// don't have a single memoperand, because MachineInstr::mayAlias()

1965// will return true if this is the case.

1966assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

1967

1968MachineInstr *New =

1969 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))

1970 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))

1971 .addImm(std::min(CI.Offset, Paired.Offset))// offset

1972 .addImm(CI.CPol)// cpol

1973 .addImm(0)// swz

1974 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

1975

1976 CI.I->eraseFromParent();

1977 Paired.I->eraseFromParent();

1978returnNew;

1979}

1980

1981MachineOperand

1982SILoadStoreOptimizer::createRegOrImm(int32_t Val,MachineInstr &MI) const{

1983APInt V(32, Val,true);

1984if (TII->isInlineConstant(V))

1985returnMachineOperand::CreateImm(Val);

1986

1987Register Reg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

1988MachineInstr *Mov =

1989BuildMI(*MI.getParent(),MI.getIterator(),MI.getDebugLoc(),

1990TII->get(AMDGPU::S_MOV_B32), Reg)

1991 .addImm(Val);

1992 (void)Mov;

1993LLVM_DEBUG(dbgs() <<" "; Mov->dump());

1994returnMachineOperand::CreateReg(Reg,false);

1995}

1996

1997// Compute base address using Addr and return the final register.

1998Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,

1999const MemAddress &Addr) const{

2000MachineBasicBlock *MBB =MI.getParent();

2001MachineBasicBlock::iterator MBBI =MI.getIterator();

2002DebugLoc DL =MI.getDebugLoc();

2003

2004assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||

2005Addr.Base.LoSubReg) &&

2006"Expected 32-bit Base-Register-Low!!");

2007

2008assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||

2009Addr.Base.HiSubReg) &&

2010"Expected 32-bit Base-Register-Hi!!");

2011

2012LLVM_DEBUG(dbgs() <<" Re-Computed Anchor-Base:\n");

2013MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset),MI);

2014MachineOperand OffsetHi =

2015 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32),MI);

2016

2017constauto *CarryRC =TRI->getWaveMaskRegClass();

2018Register CarryReg =MRI->createVirtualRegister(CarryRC);

2019Register DeadCarryReg =MRI->createVirtualRegister(CarryRC);

2020

2021Register DestSub0 =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

2022Register DestSub1 =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

2023MachineInstr *LoHalf =

2024BuildMI(*MBB,MBBI,DL,TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)

2025 .addReg(CarryReg,RegState::Define)

2026 .addReg(Addr.Base.LoReg, 0,Addr.Base.LoSubReg)

2027 .add(OffsetLo)

2028 .addImm(0);// clamp bit

2029 (void)LoHalf;

2030LLVM_DEBUG(dbgs() <<" "; LoHalf->dump(););

2031

2032MachineInstr *HiHalf =

2033BuildMI(*MBB,MBBI,DL,TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)

2034 .addReg(DeadCarryReg,RegState::Define |RegState::Dead)

2035 .addReg(Addr.Base.HiReg, 0,Addr.Base.HiSubReg)

2036 .add(OffsetHi)

2037 .addReg(CarryReg,RegState::Kill)

2038 .addImm(0);// clamp bit

2039 (void)HiHalf;

2040LLVM_DEBUG(dbgs() <<" "; HiHalf->dump(););

2041

2042Register FullDestReg =MRI->createVirtualRegister(TRI->getVGPR64Class());

2043MachineInstr *FullBase =

2044BuildMI(*MBB,MBBI,DL,TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)

2045 .addReg(DestSub0)

2046 .addImm(AMDGPU::sub0)

2047 .addReg(DestSub1)

2048 .addImm(AMDGPU::sub1);

2049 (void)FullBase;

2050LLVM_DEBUG(dbgs() <<" "; FullBase->dump();dbgs() <<"\n";);

2051

2052return FullDestReg;

2053}

2054

2055// Update base and offset with the NewBase and NewOffset in MI.

2056void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,

2057Register NewBase,

2058 int32_t NewOffset) const{

2059auto *Base =TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);

2060Base->setReg(NewBase);

2061Base->setIsKill(false);

2062TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);

2063}

2064

2065std::optional<int32_t>

2066SILoadStoreOptimizer::extractConstOffset(constMachineOperand &Op) const{

2067if (Op.isImm())

2068returnOp.getImm();

2069

2070if (!Op.isReg())

2071return std::nullopt;

2072

2073MachineInstr *Def =MRI->getUniqueVRegDef(Op.getReg());

2074if (!Def ||Def->getOpcode() != AMDGPU::S_MOV_B32 ||

2075 !Def->getOperand(1).isImm())

2076return std::nullopt;

2077

2078returnDef->getOperand(1).getImm();

2079}

2080

2081// Analyze Base and extracts:

2082// - 32bit base registers, subregisters

2083// - 64bit constant offset

2084// Expecting base computation as:

2085// %OFFSET0:sgpr_32 = S_MOV_B32 8000

2086// %LO:vgpr_32, %c:sreg_64_xexec =

2087// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,

2088// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec

2089// %Base:vreg_64 =

2090// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1

2091void SILoadStoreOptimizer::processBaseWithConstOffset(constMachineOperand &Base,

2092 MemAddress &Addr) const{

2093if (!Base.isReg())

2094return;

2095

2096MachineInstr *Def =MRI->getUniqueVRegDef(Base.getReg());

2097if (!Def ||Def->getOpcode() != AMDGPU::REG_SEQUENCE

2098 ||Def->getNumOperands() != 5)

2099return;

2100

2101MachineOperand BaseLo =Def->getOperand(1);

2102MachineOperand BaseHi =Def->getOperand(3);

2103if (!BaseLo.isReg() || !BaseHi.isReg())

2104return;

2105

2106MachineInstr *BaseLoDef =MRI->getUniqueVRegDef(BaseLo.getReg());

2107MachineInstr *BaseHiDef =MRI->getUniqueVRegDef(BaseHi.getReg());

2108

2109if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||

2110 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

2111return;

2112

2113constauto *Src0 =TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);

2114constauto *Src1 =TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

2115

2116auto Offset0P = extractConstOffset(*Src0);

2117if (Offset0P)

2118 BaseLo = *Src1;

2119else {

2120if (!(Offset0P = extractConstOffset(*Src1)))

2121return;

2122 BaseLo = *Src0;

2123 }

2124

2125if (!BaseLo.isReg())

2126return;

2127

2128 Src0 =TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);

2129 Src1 =TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

2130

2131if (Src0->isImm())

2132std::swap(Src0, Src1);

2133

2134if (!Src1->isImm() || Src0->isImm())

2135return;

2136

2137uint64_t Offset1 = Src1->getImm();

2138 BaseHi = *Src0;

2139

2140if (!BaseHi.isReg())

2141return;

2142

2143Addr.Base.LoReg = BaseLo.getReg();

2144Addr.Base.HiReg = BaseHi.getReg();

2145Addr.Base.LoSubReg = BaseLo.getSubReg();

2146Addr.Base.HiSubReg = BaseHi.getSubReg();

2147Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);

2148}

2149

2150bool SILoadStoreOptimizer::promoteConstantOffsetToImm(

2151MachineInstr &MI,

2152 MemInfoMap &Visited,

2153SmallPtrSet<MachineInstr *, 4> &AnchorList) const{

2154

2155if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))

2156returnfalse;

2157

2158// TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.

2159if (SIInstrInfo::isFLATScratch(MI))

2160returnfalse;

2161

2162unsigned AS =SIInstrInfo::isFLATGlobal(MI) ?AMDGPUAS::GLOBAL_ADDRESS

2163 :AMDGPUAS::FLAT_ADDRESS;

2164

2165if (AnchorList.count(&MI))

2166returnfalse;

2167

2168LLVM_DEBUG(dbgs() <<"\nTryToPromoteConstantOffsetToImmFor ";MI.dump());

2169

2170if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {

2171LLVM_DEBUG(dbgs() <<" Const-offset is already promoted.\n";);

2172returnfalse;

2173 }

2174

2175// Step1: Find the base-registers and a 64bit constant offset.

2176MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);

2177 MemAddress MAddr;

2178if (!Visited.contains(&MI)) {

2179 processBaseWithConstOffset(Base, MAddr);

2180 Visited[&MI] = MAddr;

2181 }else

2182 MAddr = Visited[&MI];

2183

2184if (MAddr.Offset == 0) {

2185LLVM_DEBUG(dbgs() <<" Failed to extract constant-offset or there are no"

2186" constant offsets that can be promoted.\n";);

2187returnfalse;

2188 }

2189

2190LLVM_DEBUG(dbgs() <<" BASE: {" <<printReg(MAddr.Base.HiReg,TRI) <<", "

2191 <<printReg(MAddr.Base.LoReg,TRI)

2192 <<"} Offset: " << MAddr.Offset <<"\n\n";);

2193

2194// Step2: Traverse through MI's basic block and find an anchor(that has the

2195// same base-registers) with the highest 13bit distance from MI's offset.

2196// E.g. (64bit loads)

2197// bb:

2198// addr1 = &a + 4096; load1 = load(addr1, 0)

2199// addr2 = &a + 6144; load2 = load(addr2, 0)

2200// addr3 = &a + 8192; load3 = load(addr3, 0)

2201// addr4 = &a + 10240; load4 = load(addr4, 0)

2202// addr5 = &a + 12288; load5 = load(addr5, 0)

2203//

2204// Starting from the first load, the optimization will try to find a new base

2205// from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192

2206// has 13bit distance from &a + 4096. The heuristic considers &a + 8192

2207// as the new-base(anchor) because of the maximum distance which can

2208// accommodate more intermediate bases presumably.

2209//

2210// Step3: move (&a + 8192) above load1. Compute and promote offsets from

2211// (&a + 8192) for load1, load2, load4.

2212// addr = &a + 8192

2213// load1 = load(addr, -4096)

2214// load2 = load(addr, -2048)

2215// load3 = load(addr, 0)

2216// load4 = load(addr, 2048)

2217// addr5 = &a + 12288; load5 = load(addr5, 0)

2218//

2219MachineInstr *AnchorInst =nullptr;

2220 MemAddress AnchorAddr;

2221uint32_t MaxDist = std::numeric_limits<uint32_t>::min();

2222SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;

2223

2224MachineBasicBlock *MBB =MI.getParent();

2225MachineBasicBlock::iterator E =MBB->end();

2226MachineBasicBlock::iterator MBBI =MI.getIterator();

2227 ++MBBI;

2228constSITargetLowering *TLI =

2229static_cast<constSITargetLowering *>(STM->getTargetLowering());

2230

2231for ( ;MBBI != E; ++MBBI) {

2232MachineInstr &MINext = *MBBI;

2233// TODO: Support finding an anchor(with same base) from store addresses or

2234// any other load addresses where the opcodes are different.

2235if (MINext.getOpcode() !=MI.getOpcode() ||

2236TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())

2237continue;

2238

2239constMachineOperand &BaseNext =

2240 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);

2241 MemAddress MAddrNext;

2242if (!Visited.contains(&MINext)) {

2243 processBaseWithConstOffset(BaseNext, MAddrNext);

2244 Visited[&MINext] = MAddrNext;

2245 }else

2246 MAddrNext = Visited[&MINext];

2247

2248if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||

2249 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||

2250 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||

2251 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)

2252continue;

2253

2254 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);

2255

2256 int64_t Dist = MAddr.Offset - MAddrNext.Offset;

2257TargetLoweringBase::AddrMode AM;

2258 AM.HasBaseReg =true;

2259 AM.BaseOffs = Dist;

2260if (TLI->isLegalFlatAddressingMode(AM, AS) &&

2261 (uint32_t)std::abs(Dist) > MaxDist) {

2262 MaxDist = std::abs(Dist);

2263

2264 AnchorAddr = MAddrNext;

2265 AnchorInst = &MINext;

2266 }

2267 }

2268

2269if (AnchorInst) {

2270LLVM_DEBUG(dbgs() <<" Anchor-Inst(with max-distance from Offset): ";

2271 AnchorInst->dump());

2272LLVM_DEBUG(dbgs() <<" Anchor-Offset from BASE: "

2273 << AnchorAddr.Offset <<"\n\n");

2274

2275// Instead of moving up, just re-compute anchor-instruction's base address.

2276Register Base = computeBase(MI, AnchorAddr);

2277

2278 updateBaseAndOffset(MI,Base, MAddr.Offset - AnchorAddr.Offset);

2279LLVM_DEBUG(dbgs() <<" After promotion: ";MI.dump(););

2280

2281for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {

2282TargetLoweringBase::AddrMode AM;

2283 AM.HasBaseReg =true;

2284 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;

2285

2286if (TLI->isLegalFlatAddressingMode(AM, AS)) {

2287LLVM_DEBUG(dbgs() <<" Promote Offset(" << OtherOffset;dbgs() <<")";

2288 OtherMI->dump());

2289 updateBaseAndOffset(*OtherMI,Base, OtherOffset - AnchorAddr.Offset);

2290LLVM_DEBUG(dbgs() <<" After promotion: "; OtherMI->dump());

2291 }

2292 }

2293AnchorList.insert(AnchorInst);

2294returntrue;

2295 }

2296

2297returnfalse;

2298}

2299

2300void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,

2301 std::list<std::list<CombineInfo> > &MergeableInsts) const{

2302for (std::list<CombineInfo> &AddrList : MergeableInsts) {

2303if (AddrList.front().InstClass == CI.InstClass &&

2304 AddrList.front().IsAGPR == CI.IsAGPR &&

2305 AddrList.front().hasSameBaseAddress(CI)) {

2306 AddrList.emplace_back(CI);

2307return;

2308 }

2309 }

2310

2311// Base address not found, so add a new list.

2312 MergeableInsts.emplace_back(1, CI);

2313}

2314

2315std::pair<MachineBasicBlock::iterator, bool>

2316SILoadStoreOptimizer::collectMergeableInsts(

2317MachineBasicBlock::iterator Begin,MachineBasicBlock::iterator End,

2318 MemInfoMap &Visited,SmallPtrSet<MachineInstr *, 4> &AnchorList,

2319 std::list<std::list<CombineInfo>> &MergeableInsts) const{

2320boolModified =false;

2321

2322// Sort potential mergeable instructions into lists. One list per base address.

2323unsigned Order = 0;

2324MachineBasicBlock::iterator BlockI = Begin;

2325for (; BlockI !=End; ++BlockI) {

2326MachineInstr &MI = *BlockI;

2327

2328// We run this before checking if an address is mergeable, because it can produce

2329// better code even if the instructions aren't mergeable.

2330if (promoteConstantOffsetToImm(MI, Visited,AnchorList))

2331Modified =true;

2332

2333// Treat volatile accesses, ordered accesses and unmodeled side effects as

2334// barriers. We can look after this barrier for separate merges.

2335if (MI.hasOrderedMemoryRef() ||MI.hasUnmodeledSideEffects()) {

2336LLVM_DEBUG(dbgs() <<"Breaking search on barrier: " <<MI);

2337

2338// Search will resume after this instruction in a separate merge list.

2339 ++BlockI;

2340break;

2341 }

2342

2343const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);

2344if (InstClass == UNKNOWN)

2345continue;

2346

2347// Do not merge VMEM buffer instructions with "swizzled" bit set.

2348int Swizzled =

2349AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);

2350if (Swizzled != -1 &&MI.getOperand(Swizzled).getImm())

2351continue;

2352

2353 CombineInfo CI;

2354 CI.setMI(MI, *this);

2355 CI.Order = Order++;

2356

2357if (!CI.hasMergeableAddress(*MRI))

2358continue;

2359

2360if (CI.InstClass == DS_WRITE && CI.IsAGPR) {

2361// FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data

2362// operands. However we are reporting that ds_write2 shall have

2363// only VGPR data so that machine copy propagation does not

2364// create an illegal instruction with a VGPR and AGPR sources.

2365// Consequenctially if we create such instruction the verifier

2366// will complain.

2367continue;

2368 }

2369

2370LLVM_DEBUG(dbgs() <<"Mergeable: " <<MI);

2371

2372addInstToMergeableList(CI, MergeableInsts);

2373 }

2374

2375// At this point we have lists of Mergeable instructions.

2376//

2377// Part 2: Sort lists by offset and then for each CombineInfo object in the

2378// list try to find an instruction that can be merged with I. If an instruction

2379// is found, it is stored in the Paired field. If no instructions are found, then

2380// the CombineInfo object is deleted from the list.

2381

2382for (std::list<std::list<CombineInfo>>::iteratorI = MergeableInsts.begin(),

2383 E = MergeableInsts.end();I != E;) {

2384

2385 std::list<CombineInfo> &MergeList = *I;

2386if (MergeList.size() <= 1) {

2387// This means we have found only one instruction with a given address

2388// that can be merged, and we need at least 2 instructions to do a merge,

2389// so this list can be discarded.

2390I = MergeableInsts.erase(I);

2391continue;

2392 }

2393

2394// Sort the lists by offsets, this way mergeable instructions will be

2395// adjacent to each other in the list, which will make it easier to find

2396// matches.

2397 MergeList.sort(

2398 [] (const CombineInfo &A,const CombineInfo &B) {

2399returnA.Offset <B.Offset;

2400 });

2401 ++I;

2402 }

2403

2404return {BlockI,Modified};

2405}

2406

2407// Scan through looking for adjacent LDS operations with constant offsets from

2408// the same base register. We rely on the scheduler to do the hard work of

2409// clustering nearby loads, and assume these are all adjacent.

2410bool SILoadStoreOptimizer::optimizeBlock(

2411 std::list<std::list<CombineInfo> > &MergeableInsts) {

2412boolModified =false;

2413

2414for (std::list<std::list<CombineInfo>>::iteratorI = MergeableInsts.begin(),

2415 E = MergeableInsts.end();I != E;) {

2416 std::list<CombineInfo> &MergeList = *I;

2417

2418bool OptimizeListAgain =false;

2419if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {

2420// We weren't able to make any changes, so delete the list so we don't

2421// process the same instructions the next time we try to optimize this

2422// block.

2423I = MergeableInsts.erase(I);

2424continue;

2425 }

2426

2427Modified =true;

2428

2429// We made changes, but also determined that there were no more optimization

2430// opportunities, so we don't need to reprocess the list

2431if (!OptimizeListAgain) {

2432I = MergeableInsts.erase(I);

2433continue;

2434 }

2435 OptimizeAgain =true;

2436 }

2437returnModified;

2438}

2439

2440bool

2441SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(

2442 std::list<CombineInfo> &MergeList,

2443bool &OptimizeListAgain) {

2444if (MergeList.empty())

2445returnfalse;

2446

2447boolModified =false;

2448

2449for (autoI = MergeList.begin(), Next = std::next(I); Next != MergeList.end();

2450 Next = std::next(I)) {

2451

2452autoFirst =I;

2453auto Second = Next;

2454

2455if ((*First).Order > (*Second).Order)

2456std::swap(First, Second);

2457 CombineInfo &CI = *First;

2458 CombineInfo &Paired = *Second;

2459

2460 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);

2461if (!Where) {

2462 ++I;

2463continue;

2464 }

2465

2466Modified =true;

2467

2468LLVM_DEBUG(dbgs() <<"Merging: " << *CI.I <<" with: " << *Paired.I);

2469

2470MachineBasicBlock::iterator NewMI;

2471switch (CI.InstClass) {

2472default:

2473llvm_unreachable("unknown InstClass");

2474break;

2475case DS_READ:

2476 NewMI = mergeRead2Pair(CI, Paired, Where->I);

2477break;

2478case DS_WRITE:

2479 NewMI = mergeWrite2Pair(CI, Paired, Where->I);

2480break;

2481case S_BUFFER_LOAD_IMM:

2482case S_BUFFER_LOAD_SGPR_IMM:

2483case S_LOAD_IMM:

2484 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);

2485 OptimizeListAgain |= CI.Width + Paired.Width < 8;

2486break;

2487caseBUFFER_LOAD:

2488 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);

2489 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2490break;

2491caseBUFFER_STORE:

2492 NewMI = mergeBufferStorePair(CI, Paired, Where->I);

2493 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2494break;

2495caseMIMG:

2496 NewMI = mergeImagePair(CI, Paired, Where->I);

2497 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2498break;

2499case TBUFFER_LOAD:

2500 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);

2501 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2502break;

2503case TBUFFER_STORE:

2504 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);

2505 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2506break;

2507case FLAT_LOAD:

2508case GLOBAL_LOAD:

2509case GLOBAL_LOAD_SADDR:

2510 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);

2511 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2512break;

2513case FLAT_STORE:

2514case GLOBAL_STORE:

2515case GLOBAL_STORE_SADDR:

2516 NewMI = mergeFlatStorePair(CI, Paired, Where->I);

2517 OptimizeListAgain |= CI.Width + Paired.Width < 4;

2518break;

2519 }

2520 CI.setMI(NewMI, *this);

2521 CI.Order = Where->Order;

2522if (I == Second)

2523I = Next;

2524

2525 MergeList.erase(Second);

2526 }

2527

2528returnModified;

2529}

2530

2531bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {

2532if (skipFunction(MF.getFunction()))

2533returnfalse;

2534return SILoadStoreOptimizer(

2535 &getAnalysis<AAResultsWrapperPass>().getAAResults())

2536 .run(MF);

2537}

2538

2539bool SILoadStoreOptimizer::run(MachineFunction &MF) {

2540 STM = &MF.getSubtarget<GCNSubtarget>();

2541if (!STM->loadStoreOptEnabled())

2542returnfalse;

2543

2544TII = STM->getInstrInfo();

2545TRI = &TII->getRegisterInfo();

2546

2547MRI = &MF.getRegInfo();

2548

2549LLVM_DEBUG(dbgs() <<"Running SILoadStoreOptimizer\n");

2550

2551boolModified =false;

2552

2553// Contains the list of instructions for which constant offsets are being

2554// promoted to the IMM. This is tracked for an entire block at time.

2555SmallPtrSet<MachineInstr *, 4>AnchorList;

2556 MemInfoMap Visited;

2557

2558for (MachineBasicBlock &MBB : MF) {

2559MachineBasicBlock::iterator SectionEnd;

2560for (MachineBasicBlock::iterator I =MBB.begin(), E =MBB.end();I != E;

2561I = SectionEnd) {

2562bool CollectModified;

2563 std::list<std::list<CombineInfo>> MergeableInsts;

2564

2565// First pass: Collect list of all instructions we know how to merge in a

2566// subset of the block.

2567 std::tie(SectionEnd, CollectModified) =

2568collectMergeableInsts(I, E, Visited,AnchorList, MergeableInsts);

2569

2570Modified |= CollectModified;

2571

2572do {

2573 OptimizeAgain =false;

2574Modified |=optimizeBlock(MergeableInsts);

2575 }while (OptimizeAgain);

2576 }

2577

2578 Visited.clear();

2579AnchorList.clear();

2580 }

2581

2582returnModified;

2583}

2584

2585PreservedAnalyses

2586SILoadStoreOptimizerPass::run(MachineFunction &MF,

2587MachineFunctionAnalysisManager &MFAM) {

2588MFPropsModifier _(*this, MF);

2589

2590if (MF.getFunction().hasOptNone())

2591returnPreservedAnalyses::all();

2592

2593auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)

2594 .getManager();

2595AAResults &AA =FAM.getResult<AAManager>(MF.getFunction());

2596

2597bool Changed = SILoadStoreOptimizer(&AA).run(MF);

2598if (!Changed)

2599returnPreservedAnalyses::all();

2600

2601PreservedAnalyses PA =getMachineFunctionPassPreservedAnalyses();

2602 PA.preserveSet<CFGAnalyses>();

2603return PA;

2604}

MRI

unsigned const MachineRegisterInfo * MRI

Definition:AArch64AdvSIMDScalarPass.cpp:105

addInstToMergeableList

INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE, "AMDGPU Image Intrinsic Optimizer", false, false) char AMDGPUImageIntrinsicOptimizer void addInstToMergeableList(IntrinsicInst *II, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr)

Definition:AMDGPUImageIntrinsicOptimizer.cpp:98

collectMergeableInsts

BasicBlock::iterator collectMergeableInsts(BasicBlock::iterator I, BasicBlock::iterator E, SmallVector< SmallVector< IntrinsicInst *, 4 > > &MergeableInsts)

Definition:AMDGPUImageIntrinsicOptimizer.cpp:142

AMDGPUMCTargetDesc.h

Provides AMDGPU specific target descriptions.

AMDGPU.h

MBB

MachineBasicBlock & MBB

Definition:ARMSLSHardening.cpp:71

MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL

Definition:ARMSLSHardening.cpp:73

MBBI

MachineBasicBlock MachineBasicBlock::iterator MBBI

Definition:ARMSLSHardening.cpp:72

AliasAnalysis.h

static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")

static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")

Info

Analysis containing CSE Info

Definition:CSEInfo.cpp:27

Idx

Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx

Definition:DeadArgumentElimination.cpp:353

LLVM_DEBUG

#define LLVM_DEBUG(...)

Definition:Debug.h:106

Addr

uint64_t Addr

Definition:ELFObjHandler.cpp:79

Size

uint64_t Size

Definition:ELFObjHandler.cpp:81

Other

std::optional< std::vector< StOtherPiece > > Other

Definition:ELFYAML.cpp:1315

End

bool End

Definition:ELF_riscv.cpp:480

GCNSubtarget.h

AMD GCN specific subclass of TargetSubtarget.

#define op(i)

TII

const HexagonInstrInfo * TII

Definition:HexagonCopyToCombine.cpp:125

#define _

Definition:HexagonMCCodeEmitter.cpp:46

IRTranslator LLVM IR MI

Definition:IRTranslator.cpp:112

InitializePasses.h

LoopDeletionResult::Modified

@ Modified

#define I(x, y, z)

Definition:MD5.cpp:58

MachineFunctionPass.h

TRI

unsigned const TargetRegisterInfo * TRI

Definition:MachineSink.cpp:2029

getReg

static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)

Definition:MipsDisassembler.cpp:520

FAM

FunctionAnalysisManager FAM

Definition:PassBuilderBindings.cpp:61

INITIALIZE_PASS_DEPENDENCY

#define INITIALIZE_PASS_DEPENDENCY(depName)

Definition:PassSupport.h:55

INITIALIZE_PASS_END

#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)

Definition:PassSupport.h:57

INITIALIZE_PASS_BEGIN

#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)

Definition:PassSupport.h:52

mostAlignedValueInRange

static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi)

Definition:SILoadStoreOptimizer.cpp:1029

Optimizer

SI Load Store Optimizer

Definition:SILoadStoreOptimizer.cpp:894

needsConstrainedOpcode

static bool needsConstrainedOpcode(const GCNSubtarget &STM, ArrayRef< MachineMemOperand * > MMOs, unsigned Width)

Definition:SILoadStoreOptimizer.cpp:1719

addDefsUsesToList

static void addDefsUsesToList(const MachineInstr &MI, DenseSet< Register > &RegDefs, DenseSet< Register > &RegUses)

Definition:SILoadStoreOptimizer.cpp:904

getBufferFormatWithCompCount

static unsigned getBufferFormatWithCompCount(unsigned OldFormat, unsigned ComponentCount, const GCNSubtarget &STI)

Definition:SILoadStoreOptimizer.cpp:998

DEBUG_TYPE

#define DEBUG_TYPE

Definition:SILoadStoreOptimizer.cpp:70

SILoadStoreOptimizer.h

assert

assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())

isImm

static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)

Definition:SPIRVInstructionSelector.cpp:2719

optimizeBlock

static bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT, const TargetTransformInfo &TTI, const DataLayout &DL, bool HasBranchDivergence, DomTreeUpdater *DTU)

Definition:ScalarizeMaskedMemIntrin.cpp:1060

support::ulittle16_t & Lo

Definition:aarch32.cpp:204

support::ulittle16_t & Hi

Definition:aarch32.cpp:203

llvm::AAManager

A manager for alias analyses.

Definition:AliasAnalysis.h:933

llvm::AAManager::run

Result run(Function &F, FunctionAnalysisManager &AM)

Definition:AliasAnalysis.cpp:794

llvm::AAResultsWrapperPass

A wrapper pass to provide the legacy pass manager access to a suitably prepared AAResults object.

Definition:AliasAnalysis.h:981

llvm::AAResults

Definition:AliasAnalysis.h:314

llvm::APInt

Class for arbitrary precision integers.

Definition:APInt.h:78

llvm::AnalysisManager

A container for analyses that lazily runs them and caches their results.

Definition:PassManager.h:253

llvm::AnalysisManager::getResult

PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)

Get the result of an analysis pass for a given IR unit.

Definition:PassManager.h:410

llvm::AnalysisUsage

Represent the analysis usage information of a pass.

Definition:PassAnalysisSupport.h:47

llvm::AnalysisUsage::addRequired

AnalysisUsage & addRequired()

Definition:PassAnalysisSupport.h:75

llvm::AnalysisUsage::setPreservesCFG

void setPreservesCFG()

This function should be called by the pass, iff they do not:

Definition:Pass.cpp:256

llvm::ArrayRef

ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...

Definition:ArrayRef.h:41

llvm::ArrayRef::size

size_t size() const

size - Get the array size.

Definition:ArrayRef.h:168

llvm::CFGAnalyses

Represents analyses that only rely on functions' control flow.

Definition:Analysis.h:72

llvm::DWARFExpression::Operation

This class represents an Operation in the Expression.

Definition:DWARFExpression.h:32

llvm::DebugLoc

A debug info location.

Definition:DebugLoc.h:33

llvm::DenseMap

Definition:DenseMap.h:727

llvm::DenseSet

Implements a dense probed hash-table based set.

Definition:DenseSet.h:278

llvm::FunctionAnalysisManagerMachineFunctionProxy

Definition:MachinePassManager.h:132

llvm::FunctionPass

FunctionPass class - This class is used to implement most global optimizations.

Definition:Pass.h:310

llvm::Function::hasOptNone

bool hasOptNone() const

Do not optimize this function (-O0).

Definition:Function.h:701

llvm::GCNSubtarget

Definition:GCNSubtarget.h:34

llvm::GCNSubtarget::loadStoreOptEnabled

bool loadStoreOptEnabled() const

Definition:GCNSubtarget.h:1141

llvm::GCNSubtarget::hasFlatInstOffsets

bool hasFlatInstOffsets() const

Definition:GCNSubtarget.h:641

llvm::GCNSubtarget::getInstrInfo

const SIInstrInfo * getInstrInfo() const override

Definition:GCNSubtarget.h:279

llvm::GCNSubtarget::hasDwordx3LoadStores

bool hasDwordx3LoadStores() const

Definition:GCNSubtarget.h:1168

llvm::GCNSubtarget::getTargetLowering

const SITargetLowering * getTargetLowering() const override

Definition:GCNSubtarget.h:287

llvm::GCNSubtarget::ldsRequiresM0Init

bool ldsRequiresM0Init() const

Return if most LDS instructions have an m0 use that require m0 to be initialized.

Definition:GCNSubtarget.h:716

llvm::GCNSubtarget::hasScalarDwordx3Loads

bool hasScalarDwordx3Loads() const

Definition:GCNSubtarget.h:1025

llvm::GCNSubtarget::isXNACKEnabled

bool isXNACKEnabled() const

Definition:GCNSubtarget.h:619

llvm::LocationSize::getValue

TypeSize getValue() const

Definition:MemoryLocation.h:170

llvm::MCInstrDesc

Describe properties that are true of each instruction in the target description file.

Definition:MCInstrDesc.h:198

llvm::MFPropsModifier

An RAII based helper class to modify MachineFunctionProperties when running pass.

Definition:MachinePassManager.h:42

llvm::MachineBasicBlock

Definition:MachineBasicBlock.h:125

llvm::MachineBasicBlock::begin

iterator begin()

Definition:MachineBasicBlock.h:355

llvm::MachineBasicBlock::end

iterator end()

Definition:MachineBasicBlock.h:357

llvm::MachineBasicBlock::getParent

const MachineFunction * getParent() const

Return the MachineFunction containing this basic block.

Definition:MachineBasicBlock.h:311

llvm::MachineFunctionPass

MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...

Definition:MachineFunctionPass.h:30

llvm::MachineFunctionPass::getAnalysisUsage

void getAnalysisUsage(AnalysisUsage &AU) const override

getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.

Definition:MachineFunctionPass.cpp:169

llvm::MachineFunctionPass::runOnMachineFunction

virtual bool runOnMachineFunction(MachineFunction &MF)=0

runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...

llvm::MachineFunctionPass::getRequiredProperties

virtual MachineFunctionProperties getRequiredProperties() const

Definition:MachineFunctionPass.h:56

llvm::MachineFunctionProperties

Properties which a MachineFunction may have at a given point in time.

Definition:MachineFunction.h:137

llvm::MachineFunctionProperties::set

MachineFunctionProperties & set(Property P)

Definition:MachineFunction.h:207

llvm::MachineFunction

Definition:MachineFunction.h:267

llvm::MachineFunction::getSubtarget

const TargetSubtargetInfo & getSubtarget() const

getSubtarget - Return the subtarget for which this machine code is being compiled.

Definition:MachineFunction.h:733

llvm::MachineFunction::getMachineMemOperand

MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)

getMachineMemOperand - Allocate a new MachineMemOperand.

Definition:MachineFunction.cpp:536

llvm::MachineFunction::getRegInfo

MachineRegisterInfo & getRegInfo()

getRegInfo - Return information about the registers currently in use.

Definition:MachineFunction.h:743

llvm::MachineFunction::getFunction

Function & getFunction()

Return the LLVM function that this machine code represents.

Definition:MachineFunction.h:704

llvm::MachineInstrBuilder

Definition:MachineInstrBuilder.h:71

llvm::MachineInstrBuilder::cloneMergedMemRefs

const MachineInstrBuilder & cloneMergedMemRefs(ArrayRef< const MachineInstr * > OtherMIs) const

Definition:MachineInstrBuilder.h:221

llvm::MachineInstrBuilder::addImm

const MachineInstrBuilder & addImm(int64_t Val) const

Add a new immediate operand.

Definition:MachineInstrBuilder.h:133

llvm::MachineInstrBuilder::add

const MachineInstrBuilder & add(const MachineOperand &MO) const

Definition:MachineInstrBuilder.h:226

llvm::MachineInstrBuilder::addReg

const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const

Add a new virtual register operand.

Definition:MachineInstrBuilder.h:99

llvm::MachineInstrBundleIterator< MachineInstr >

llvm::MachineInstr

Representation of each machine instruction.

Definition:MachineInstr.h:71

llvm::MachineInstr::getOpcode

unsigned getOpcode() const

Returns the opcode of this MachineInstr.

Definition:MachineInstr.h:577

llvm::MachineInstr::eraseFromParent

void eraseFromParent()

Unlink 'this' from the containing basic block and delete it.

Definition:MachineInstr.cpp:767

llvm::MachineInstr::dump

void dump() const

Definition:MachineInstr.cpp:1695

llvm::MachineMemOperand

A description of a memory reference used in the backend.

Definition:MachineMemOperand.h:129

llvm::MachineMemOperand::getSize

LocationSize getSize() const

Return the size in bytes of the memory reference.

Definition:MachineMemOperand.h:240

llvm::MachineMemOperand::getAddrSpace

unsigned getAddrSpace() const

Definition:MachineMemOperand.h:233

llvm::MachineMemOperand::getPointerInfo

const MachinePointerInfo & getPointerInfo() const

Definition:MachineMemOperand.h:204

llvm::MachineOperand

MachineOperand class - Representation of each machine instruction operand.

Definition:MachineOperand.h:48

llvm::MachineOperand::getSubReg

unsigned getSubReg() const

Definition:MachineOperand.h:374

llvm::MachineOperand::getImm

int64_t getImm() const

Definition:MachineOperand.h:556

llvm::MachineOperand::isReg

bool isReg() const

isReg - Tests if this is a MO_Register operand.

Definition:MachineOperand.h:329

llvm::MachineOperand::isImm

bool isImm() const

isImm - Tests if this is a MO_Immediate operand.

Definition:MachineOperand.h:331

llvm::MachineOperand::CreateImm

static MachineOperand CreateImm(int64_t Val)

Definition:MachineOperand.h:820

llvm::MachineOperand::getReg

getReg - Returns the register number.

Definition:MachineOperand.h:369

llvm::MachineOperand::CreateReg

static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)

Definition:MachineOperand.h:838

llvm::MachineRegisterInfo

MachineRegisterInfo - Keep track of information for virtual and physical registers,...

Definition:MachineRegisterInfo.h:51

llvm::Pass::dump

void dump() const

Definition:Pass.cpp:136

llvm::Pass::getPassName

virtual StringRef getPassName() const

getPassName - Return a nice clean name for a pass.

Definition:Pass.cpp:81

llvm::PreservedAnalyses

A set of analyses that are preserved following a run of a transformation pass.

Definition:Analysis.h:111

llvm::PreservedAnalyses::all

static PreservedAnalyses all()

Construct a special preserved set that preserves all passes.

Definition:Analysis.h:117

llvm::PreservedAnalyses::preserveSet

void preserveSet()

Mark an analysis set as preserved.

Definition:Analysis.h:146

llvm::Register

Wrapper class representing virtual and physical registers.

Definition:Register.h:19

llvm::Register::isPhysical

constexpr bool isPhysical() const

Return true if the specified register number is in the physical register namespace.

Definition:Register.h:95

llvm::SIInstrInfo

Definition:SIInstrInfo.h:85

llvm::SIInstrInfo::isFLATScratch

static bool isFLATScratch(const MachineInstr &MI)

Definition:SIInstrInfo.h:645

llvm::SIInstrInfo::isFLATGlobal

static bool isFLATGlobal(const MachineInstr &MI)

Definition:SIInstrInfo.h:637

llvm::SIInstrInfo::isFLAT

static bool isFLAT(const MachineInstr &MI)

Definition:SIInstrInfo.h:621

llvm::SILoadStoreOptimizerPass::run

PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)

Definition:SILoadStoreOptimizer.cpp:2586

llvm::SIRegisterInfo

Definition:SIRegisterInfo.h:32

llvm::SITargetLowering

Definition:SIISelLowering.h:31

llvm::SITargetLowering::isLegalFlatAddressingMode

bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const

Definition:SIISelLowering.cpp:1535

llvm::SmallPtrSet

SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.

Definition:SmallPtrSet.h:519

llvm::SmallVectorImpl::emplace_back

reference emplace_back(ArgTypes &&... Args)

Definition:SmallVector.h:937

llvm::SmallVector

This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.

Definition:SmallVector.h:1196

llvm::StringRef

StringRef - Represent a constant reference to a string, i.e.

Definition:StringRef.h:51

llvm::TargetRegisterClass

Definition:TargetRegisterInfo.h:44

llvm::detail::DenseSetImpl::insert

std::pair< iterator, bool > insert(const ValueT &V)

Definition:DenseSet.h:213

llvm::detail::DenseSetImpl::contains

bool contains(const_arg_type_t< ValueT > V) const

Check if the set contains the given element.

Definition:DenseSet.h:193

#define llvm_unreachable(msg)

Marks that the current location is not supposed to be reachable.

Definition:ErrorHandling.h:143

OpName

Definition:R600Defines.h:62

false

Definition:StackSlotColoring.cpp:193

llvm::AMDGPUAS::FLAT_ADDRESS

@ FLAT_ADDRESS

Address space for flat memory.

Definition:AMDGPUAddrSpace.h:30

llvm::AMDGPUAS::GLOBAL_ADDRESS

@ GLOBAL_ADDRESS

Address space for global memory (RAT0, VTX0).

Definition:AMDGPUAddrSpace.h:31

llvm::AMDGPUISD::BUFFER_STORE

@ BUFFER_STORE

Definition:AMDGPUISelLowering.h:584

llvm::AMDGPUISD::BUFFER_LOAD

@ BUFFER_LOAD

Definition:AMDGPUISelLowering.h:565

llvm::AMDGPU::getMIMGInfo

LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)

llvm::AMDGPU::convertSMRDOffsetUnits

uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset)

Convert ByteOffset to dwords if the subtarget uses dword SMRD immediate offsets.

Definition:AMDGPUBaseInfo.cpp:2898

llvm::AMDGPU::getMTBUFHasSrsrc

bool getMTBUFHasSrsrc(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:452

llvm::AMDGPU::getMTBUFElements

int getMTBUFElements(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:442

llvm::AMDGPU::getNamedOperandIdx

LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)

llvm::AMDGPU::getMTBUFHasSoffset

bool getMTBUFHasSoffset(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:457

llvm::AMDGPU::getMUBUFOpcode

int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements)

Definition:AMDGPUBaseInfo.cpp:467

llvm::AMDGPU::getMUBUFBaseOpcode

int getMUBUFBaseOpcode(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:462

llvm::AMDGPU::getMTBUFBaseOpcode

int getMTBUFBaseOpcode(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:432

llvm::AMDGPU::getMUBUFHasVAddr

bool getMUBUFHasVAddr(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:477

llvm::AMDGPU::hasNamedOperand

LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)

Definition:AMDGPUBaseInfo.h:400

llvm::AMDGPU::getMTBUFOpcode

int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements)

Definition:AMDGPUBaseInfo.cpp:437

llvm::AMDGPU::getMUBUFHasSoffset

bool getMUBUFHasSoffset(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:487

llvm::AMDGPU::getMIMGBaseOpcode

const MIMGBaseOpcodeInfo * getMIMGBaseOpcode(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:280

llvm::AMDGPU::getMIMGBaseOpcodeInfo

LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)

llvm::AMDGPU::getMaskedMIMGOp

int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)

Definition:AMDGPUBaseInfo.cpp:285

llvm::AMDGPU::getMTBUFHasVAddr

bool getMTBUFHasVAddr(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:447

llvm::AMDGPU::getMUBUFElements

int getMUBUFElements(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:472

llvm::AMDGPU::getGcnBufferFormatInfo

const GcnBufferFormatInfo * getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, uint8_t NumFormat, const MCSubtargetInfo &STI)

Definition:AMDGPUBaseInfo.cpp:2985

llvm::AMDGPU::getMUBUFHasSrsrc

bool getMUBUFHasSrsrc(unsigned Opc)

Definition:AMDGPUBaseInfo.cpp:482

llvm::BitmaskEnumDetail::Mask

constexpr std::underlying_type_t< E > Mask()

Get a bitmask with 1s in all places up to the high-order bit of E's largest value.

Definition:BitmaskEnum.h:125

llvm::CallingConv::ID

unsigned ID

LLVM IR allows to use arbitrary numbers as calling convention identifiers.

Definition:CallingConv.h:24

llvm::LPAC::UNKNOWN

@ UNKNOWN

Definition:LanaiAluCode.h:39

llvm::M68k::MemAddrModeKind::V

@ V

llvm::RegState::Dead

@ Dead

Unused definition.

Definition:MachineInstrBuilder.h:52

llvm::RegState::Define

@ Define

Definition:MachineInstrBuilder.h:46

llvm::RegState::Kill

@ Kill

The last use of a register.

Definition:MachineInstrBuilder.h:50

llvm::SIInstrFlags::MIMG

@ MIMG

Definition:SIDefines.h:83

llvm::X86Disassembler::Reg

Reg

All possible values of the reg field in the ModR/M byte.

Definition:X86DisassemblerDecoder.h:621

llvm::ms_demangle::IntrinsicFunctionKind::New

@ New

llvm::pdb::DbgHeaderType::Max

@ Max

llvm::rdf::Def

NodeAddr< DefNode * > Def

Definition:RDFGraph.h:384

llvm::sampleprof::Base

@ Base

Definition:Discriminator.h:58

llvm

This is an optimization pass for GlobalISel generic memory operations.

Definition:AddressRanges.h:18

llvm::Offset

@ Offset

Definition:DWP.cpp:480

llvm::operator<

bool operator<(int64_t V1, const APSInt &V2)

Definition:APSInt.h:361

llvm::AnchorList

std::vector< std::pair< LineLocation, FunctionId > > AnchorList

Definition:SampleProfileMatcher.h:22

llvm::popcount

int popcount(T Value) noexcept

Count the number of set bits in a value.

Definition:bit.h:385

llvm::BuildMI

MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)

Builder interface. Specify how to create the initial instruction itself.

Definition:MachineInstrBuilder.h:373

llvm::createSILoadStoreOptimizerLegacyPass

FunctionPass * createSILoadStoreOptimizerLegacyPass()

Definition:SILoadStoreOptimizer.cpp:900

llvm::SILoadStoreOptimizerLegacyID

char & SILoadStoreOptimizerLegacyID

Definition:SILoadStoreOptimizer.cpp:898

llvm::countr_zero

int countr_zero(T Val)

Count number of 0's from the least significant bit to the most stopping at the first 1.

Definition:bit.h:215

llvm::getMachineFunctionPassPreservedAnalyses

PreservedAnalyses getMachineFunctionPassPreservedAnalyses()

Returns the minimum set of Analyses that all machine function passes must preserve.

Definition:MachinePassManager.cpp:158

llvm::countl_zero

int countl_zero(T Val)

Count number of 0's from the most significant bit to the least stopping at the first 1.

Definition:bit.h:281

llvm::dbgs

raw_ostream & dbgs()

dbgs() - This returns a reference to a raw_ostream for debugging messages.

Definition:Debug.cpp:163

llvm::IRMemLocation::First

@ First

Helpers to iterate all locations in the MemoryEffectsBase class.

llvm::BitWidth

constexpr unsigned BitWidth

Definition:BitmaskEnum.h:217

llvm::printReg

Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)

Prints virtual and physical registers with or without a TRI instance.

Definition:TargetRegisterInfo.cpp:107

std::swap

void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)

Implement std::swap in terms of BitVector swap.

Definition:BitVector.h:860

llvm::AMDGPU::GcnBufferFormatInfo

Definition:AMDGPUBaseInfo.h:87

llvm::AMDGPU::GcnBufferFormatInfo::BitsPerComp

unsigned BitsPerComp

Definition:AMDGPUBaseInfo.h:89

llvm::AMDGPU::GcnBufferFormatInfo::Format

unsigned Format

Definition:AMDGPUBaseInfo.h:88

llvm::AMDGPU::GcnBufferFormatInfo::NumFormat

unsigned NumFormat

Definition:AMDGPUBaseInfo.h:91

llvm::AMDGPU::MIMGBaseOpcodeInfo::Sampler

bool Sampler

Definition:AMDGPUBaseInfo.h:412

llvm::AMDGPU::MIMGInfo

Definition:AMDGPUBaseInfo.h:509

llvm::MachinePointerInfo

This class contains a discriminated union of information about pointers in memory operands,...

Definition:MachineMemOperand.h:41

llvm::TargetLoweringBase::AddrMode

This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...

Definition:TargetLowering.h:2816

llvm::TargetLoweringBase::AddrMode::BaseOffs

int64_t BaseOffs

Definition:TargetLowering.h:2818

llvm::TargetLoweringBase::AddrMode::HasBaseReg

bool HasBaseReg

Definition:TargetLowering.h:2819