LLVM 20.0.0git
SILoadStoreOptimizer.cpp
1//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass tries to fuse DS instructions with nearby immediate offsets.
10// This will fuse operations such as
11// ds_read_b32 v0, v2 offset:16
12// ds_read_b32 v1, v2 offset:32
13// ==>
14// ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
15//
16// The same is done for certain SMEM and VMEM opcodes, e.g.:
17// s_buffer_load_dword s4, s[0:3], 4
18// s_buffer_load_dword s5, s[0:3], 8
19// ==>
20// s_buffer_load_dwordx2 s[4:5], s[0:3], 4
21//
22// This pass also tries to promote a constant offset to the immediate by
23// adjusting the base. It tries to use a base from the nearby instructions that
24// allows it to have a 13-bit constant offset and then promotes the 13-bit
25// offset to the immediate.
26// E.g.
27// s_movk_i32 s0, 0x1800
28// v_add_co_u32_e32 v0, vcc, s0, v2
29// v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
30//
31// s_movk_i32 s0, 0x1000
32// v_add_co_u32_e32 v5, vcc, s0, v2
33// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
34// global_load_dwordx2 v[5:6], v[5:6], off
35// global_load_dwordx2 v[0:1], v[0:1], off
36// =>
37// s_movk_i32 s0, 0x1000
38// v_add_co_u32_e32 v5, vcc, s0, v2
39// v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
40// global_load_dwordx2 v[5:6], v[5:6], off
41// global_load_dwordx2 v[0:1], v[5:6], off offset:2048
42//
43// Future improvements:
44//
45// - This currently misses stores of constants because loading
46// the constant into the data register is placed between the stores, although
47// this is arguably a scheduling problem.
48//
49// - Live interval recomputation seems inefficient. This currently only matches
50// one pair, recomputes live intervals, and moves on to the next pair. It
51// would be better to compute a list of all merges that need to occur.
52//
53// - With a list of instructions to process, we can also merge more. If a
54// cluster of loads has offsets that are too large to fit in the 8-bit
55// offsets, but are close enough together to fit in 8 bits, we can add to the
56// base pointer and use the new, reduced offsets.
57//
58//===----------------------------------------------------------------------===//
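//
// A worked example of the DS offset arithmetic above: ds_read_b32 encodes a
// byte offset, while ds_read2_b32 encodes two 8-bit element offsets scaled by
// the element size (4 bytes for b32). Byte offsets 16 and 32 therefore become
// offset0 = 16/4 = 4 and offset1 = 32/4 = 8 in the merged instruction.
//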
59
60#include "SILoadStoreOptimizer.h"
61#include "AMDGPU.h"
62#include "GCNSubtarget.h"
63#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
64#include "llvm/Analysis/AliasAnalysis.h"
65#include "llvm/CodeGen/MachineFunctionPass.h"
66#include "llvm/InitializePasses.h"
67
using namespace llvm;
69
70#define DEBUG_TYPE "si-load-store-opt"
71
72namespace{
73enum InstClassEnum {
74 UNKNOWN,
75 DS_READ,
76 DS_WRITE,
77 S_BUFFER_LOAD_IMM,
78 S_BUFFER_LOAD_SGPR_IMM,
79 S_LOAD_IMM,
80 BUFFER_LOAD,
81 BUFFER_STORE,
82 MIMG,
83 TBUFFER_LOAD,
84 TBUFFER_STORE,
85 GLOBAL_LOAD_SADDR,
86 GLOBAL_STORE_SADDR,
87 FLAT_LOAD,
88 FLAT_STORE,
89 GLOBAL_LOAD,// GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
90 GLOBAL_STORE// any CombineInfo, they are only ever returned by
91// getCommonInstClass.
92};
93
94structAddressRegs {
95unsignedchar NumVAddrs = 0;
96bool SBase =false;
97bool SRsrc =false;
98bool SOffset =false;
99bool SAddr =false;
100bool VAddr =false;
101boolAddr =false;
102bool SSamp =false;
103};
104
105// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
106constunsigned MaxAddressRegs = 12 + 1 + 1;
107
108classSILoadStoreOptimizer {
109structCombineInfo {
110MachineBasicBlock::iteratorI;
111unsigned EltSize;
112unsigned Offset;
113unsigned Width;
114unsigned Format;
115unsigned BaseOff;
116unsigned DMask;
117 InstClassEnum InstClass;
118unsigned CPol = 0;
119bool IsAGPR;
120bool UseST64;
121int AddrIdx[MaxAddressRegs];
122constMachineOperand *AddrReg[MaxAddressRegs];
123unsigned NumAddresses;
124unsigned Order;
125
126bool hasSameBaseAddress(const CombineInfo &CI) {
127if (NumAddresses != CI.NumAddresses)
128returnfalse;
129
130constMachineInstr &MI = *CI.I;
131for (unsigned i = 0; i < NumAddresses; i++) {
132constMachineOperand &AddrRegNext =MI.getOperand(AddrIdx[i]);
133
134if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
135if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
136 AddrReg[i]->getImm() != AddrRegNext.getImm()) {
137returnfalse;
138 }
139continue;
140 }
141
142// Check same base pointer. Be careful of subregisters, which can occur
143// with vectors of pointers.
144if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
145 AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
146returnfalse;
147 }
148 }
149returntrue;
150 }
151
152bool hasMergeableAddress(constMachineRegisterInfo &MRI) {
153for (unsigned i = 0; i < NumAddresses; ++i) {
154constMachineOperand *AddrOp = AddrReg[i];
155// Immediates are always OK.
156if (AddrOp->isImm())
157continue;
158
159// Don't try to merge addresses that aren't either immediates or registers.
160// TODO: Should be possible to merge FrameIndexes and maybe some other
161// non-register
162if (!AddrOp->isReg())
163returnfalse;
164
165// TODO: We should be able to merge instructions with other physical reg
166// addresses too.
167if (AddrOp->getReg().isPhysical() &&
168 AddrOp->getReg() != AMDGPU::SGPR_NULL)
169returnfalse;
170
171// If an address has only one use then there will be no other
172// instructions with the same address, so we can't merge this one.
173if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
174returnfalse;
175 }
176returntrue;
177 }
178
179void setMI(MachineBasicBlock::iteratorMI,const SILoadStoreOptimizer &LSO);
180
181// Compare by pointer order.
182booloperator<(const CombineInfo&Other) const{
183return (InstClass == MIMG) ? DMask <Other.DMask : Offset <Other.Offset;
184 }
185 };
186
187structBaseRegisters {
188Register LoReg;
189Register HiReg;
190
191unsigned LoSubReg = 0;
192unsigned HiSubReg = 0;
193 };
194
195structMemAddress {
196 BaseRegistersBase;
197 int64_t Offset = 0;
198 };
199
using MemInfoMap = DenseMap<MachineInstr *, MemAddress>;
201
202private:
203constGCNSubtarget *STM =nullptr;
204constSIInstrInfo *TII =nullptr;
205constSIRegisterInfo *TRI =nullptr;
206MachineRegisterInfo *MRI =nullptr;
207AliasAnalysis *AA =nullptr;
208bool OptimizeAgain;
209
210bool canSwapInstructions(constDenseSet<Register> &ARegDefs,
211constDenseSet<Register> &ARegUses,
212constMachineInstr &A,constMachineInstr &B)const;
213staticbool dmasksCanBeCombined(const CombineInfo &CI,
214constSIInstrInfo &TII,
215const CombineInfo &Paired);
216staticbool offsetsCanBeCombined(CombineInfo &CI,constGCNSubtarget &STI,
217 CombineInfo &Paired,bool Modify =false);
218staticbool widthsFit(constGCNSubtarget &STI,const CombineInfo &CI,
219const CombineInfo &Paired);
220unsigned getNewOpcode(const CombineInfo &CI,const CombineInfo &Paired);
221static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
222const CombineInfo &Paired);
223constTargetRegisterClass *
224 getTargetRegisterClass(const CombineInfo &CI,
225const CombineInfo &Paired)const;
226constTargetRegisterClass *getDataRegClass(constMachineInstr &MI)const;
227
228 CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);
229
230void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
231MachineBasicBlock::iterator InsertBefore,intOpName,
232Register DestReg)const;
233Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
234MachineBasicBlock::iterator InsertBefore,
235intOpName)const;
236
237unsigned read2Opcode(unsigned EltSize)const;
238unsigned read2ST64Opcode(unsigned EltSize)const;
239MachineBasicBlock::iterator
240 mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
241MachineBasicBlock::iterator InsertBefore);
242
243unsigned write2Opcode(unsigned EltSize)const;
244unsigned write2ST64Opcode(unsigned EltSize)const;
245MachineBasicBlock::iterator
246 mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
247MachineBasicBlock::iterator InsertBefore);
248MachineBasicBlock::iterator
249 mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
250MachineBasicBlock::iterator InsertBefore);
251MachineBasicBlock::iterator
252 mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
253MachineBasicBlock::iterator InsertBefore);
254MachineBasicBlock::iterator
255 mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
256MachineBasicBlock::iterator InsertBefore);
257MachineBasicBlock::iterator
258 mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
259MachineBasicBlock::iterator InsertBefore);
260MachineBasicBlock::iterator
261 mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
262MachineBasicBlock::iterator InsertBefore);
263MachineBasicBlock::iterator
264 mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
265MachineBasicBlock::iterator InsertBefore);
266MachineBasicBlock::iterator
267 mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
268MachineBasicBlock::iterator InsertBefore);
269MachineBasicBlock::iterator
270 mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
271MachineBasicBlock::iterator InsertBefore);
272
273void updateBaseAndOffset(MachineInstr &I,Register NewBase,
274 int32_t NewOffset)const;
275Register computeBase(MachineInstr &MI,const MemAddress &Addr)const;
276MachineOperand createRegOrImm(int32_t Val,MachineInstr &MI)const;
277 std::optional<int32_t> extractConstOffset(constMachineOperand &Op)const;
278void processBaseWithConstOffset(constMachineOperand &Base, MemAddress &Addr)const;
 /// Promotes a constant offset to the immediate by adjusting the base. It
 /// tries to use a base from the nearby instructions that allows it to have
 /// a 13-bit constant offset which gets promoted to the immediate.
282bool promoteConstantOffsetToImm(MachineInstr &CI,
283 MemInfoMap &Visited,
284SmallPtrSet<MachineInstr *, 4> &Promoted)const;
285voidaddInstToMergeableList(const CombineInfo &CI,
286 std::list<std::list<CombineInfo> > &MergeableInsts)const;
287
288 std::pair<MachineBasicBlock::iterator, bool>collectMergeableInsts(
289MachineBasicBlock::iterator Begin,MachineBasicBlock::iteratorEnd,
290 MemInfoMap &Visited,SmallPtrSet<MachineInstr *, 4> &AnchorList,
291 std::list<std::list<CombineInfo>> &MergeableInsts)const;
292
293staticMachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
294const CombineInfo &Paired);
295
296static InstClassEnum getCommonInstClass(const CombineInfo &CI,
297const CombineInfo &Paired);
298
299bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
300bool &OptimizeListAgain);
301booloptimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
302
303public:
304 SILoadStoreOptimizer(AliasAnalysis *AA) : AA(AA) {}
305bool run(MachineFunction &MF);
306};
307
308classSILoadStoreOptimizerLegacy :publicMachineFunctionPass {
309public:
310staticcharID;
311
312 SILoadStoreOptimizerLegacy() :MachineFunctionPass(ID) {}
313
314boolrunOnMachineFunction(MachineFunction &MF)override;
315
316StringRefgetPassName() const override{return"SI Load Store Optimizer"; }
317
318voidgetAnalysisUsage(AnalysisUsage &AU) const override{
319 AU.setPreservesCFG();
320 AU.addRequired<AAResultsWrapperPass>();
321
322MachineFunctionPass::getAnalysisUsage(AU);
323 }
324
325MachineFunctionPropertiesgetRequiredProperties() const override{
326returnMachineFunctionProperties()
327 .set(MachineFunctionProperties::Property::IsSSA);
328 }
329};
330
331staticunsigned getOpcodeWidth(constMachineInstr &MI,constSIInstrInfo &TII) {
332constunsigned Opc =MI.getOpcode();
333
334if (TII.isMUBUF(Opc)) {
335// FIXME: Handle d16 correctly
336returnAMDGPU::getMUBUFElements(Opc);
337 }
338if (TII.isImage(MI)) {
339uint64_t DMaskImm =
340TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
341returnllvm::popcount(DMaskImm);
342 }
343if (TII.isMTBUF(Opc)) {
344returnAMDGPU::getMTBUFElements(Opc);
345 }
346
347switch (Opc) {
348case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
349case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
350case AMDGPU::S_LOAD_DWORD_IMM:
351case AMDGPU::GLOBAL_LOAD_DWORD:
352case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
353case AMDGPU::GLOBAL_STORE_DWORD:
354case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
355case AMDGPU::FLAT_LOAD_DWORD:
356case AMDGPU::FLAT_STORE_DWORD:
357return 1;
358case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
359case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
360case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
361case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
362case AMDGPU::S_LOAD_DWORDX2_IMM:
363case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
364case AMDGPU::GLOBAL_LOAD_DWORDX2:
365case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
366case AMDGPU::GLOBAL_STORE_DWORDX2:
367case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
368case AMDGPU::FLAT_LOAD_DWORDX2:
369case AMDGPU::FLAT_STORE_DWORDX2:
370return 2;
371case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
372case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
373case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
374case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
375case AMDGPU::S_LOAD_DWORDX3_IMM:
376case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
377case AMDGPU::GLOBAL_LOAD_DWORDX3:
378case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
379case AMDGPU::GLOBAL_STORE_DWORDX3:
380case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
381case AMDGPU::FLAT_LOAD_DWORDX3:
382case AMDGPU::FLAT_STORE_DWORDX3:
383return 3;
384case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
385case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
386case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
387case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
388case AMDGPU::S_LOAD_DWORDX4_IMM:
389case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
390case AMDGPU::GLOBAL_LOAD_DWORDX4:
391case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
392case AMDGPU::GLOBAL_STORE_DWORDX4:
393case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
394case AMDGPU::FLAT_LOAD_DWORDX4:
395case AMDGPU::FLAT_STORE_DWORDX4:
396return 4;
397case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
398case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
399case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
400case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
401case AMDGPU::S_LOAD_DWORDX8_IMM:
402case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
403return 8;
404case AMDGPU::DS_READ_B32:
405case AMDGPU::DS_READ_B32_gfx9:
406case AMDGPU::DS_WRITE_B32:
407case AMDGPU::DS_WRITE_B32_gfx9:
408return 1;
409case AMDGPU::DS_READ_B64:
410case AMDGPU::DS_READ_B64_gfx9:
411case AMDGPU::DS_WRITE_B64:
412case AMDGPU::DS_WRITE_B64_gfx9:
413return 2;
414default:
415return 0;
416 }
417}
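
// For illustration: the width returned above is measured in dwords (32-bit
// components), so e.g. GLOBAL_LOAD_DWORDX2 reports 2 and an image load with
// dmask 0b0101 reports popcount(0b0101) = 2. The merge logic later checks
// that CI.Width + Paired.Width still maps onto an existing opcode (see
// widthsFit below).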
418
419/// Maps instruction opcode to enum InstClassEnum.
420static InstClassEnum getInstClass(unsigned Opc,constSIInstrInfo &TII) {
421switch (Opc) {
422default:
423if (TII.isMUBUF(Opc)) {
424switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
425default:
426return UNKNOWN;
427case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
428case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
429case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
430case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
431case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
432case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
433case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
434case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
435case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
436case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
437case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
438case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
439case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
440case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
441case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
442case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
443return BUFFER_LOAD;
444case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
445case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
446case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
447case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
448case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
449case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
450case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
451case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
452case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
453case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
454case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
455case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
456case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
457case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
458case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
459case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
460return BUFFER_STORE;
461 }
462 }
463if (TII.isImage(Opc)) {
464// Ignore instructions encoded without vaddr.
465if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
466 !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
467return UNKNOWN;
468// Ignore BVH instructions
469if (AMDGPU::getMIMGBaseOpcode(Opc)->BVH)
470return UNKNOWN;
471// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
472if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
473TII.isGather4(Opc))
474return UNKNOWN;
475return MIMG;
476 }
477if (TII.isMTBUF(Opc)) {
478switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
479default:
480return UNKNOWN;
481case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
482case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
483case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
484case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
485case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
486case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
487case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
488case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
489case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
490case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
491case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
492case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
493case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
494case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
495case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
496case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
497return TBUFFER_LOAD;
498case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
499case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
500case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
501case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
502case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
503case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
504case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
505case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
506return TBUFFER_STORE;
507 }
508 }
509return UNKNOWN;
510case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
511case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
512case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
513case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
514case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
515case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
516case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
517case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
518case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
519return S_BUFFER_LOAD_IMM;
520case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
521case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
522case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
523case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
524case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
525case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
526case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
527case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
528case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
529return S_BUFFER_LOAD_SGPR_IMM;
530case AMDGPU::S_LOAD_DWORD_IMM:
531case AMDGPU::S_LOAD_DWORDX2_IMM:
532case AMDGPU::S_LOAD_DWORDX3_IMM:
533case AMDGPU::S_LOAD_DWORDX4_IMM:
534case AMDGPU::S_LOAD_DWORDX8_IMM:
535case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
536case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
537case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
538case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
539return S_LOAD_IMM;
540case AMDGPU::DS_READ_B32:
541case AMDGPU::DS_READ_B32_gfx9:
542case AMDGPU::DS_READ_B64:
543case AMDGPU::DS_READ_B64_gfx9:
544return DS_READ;
545case AMDGPU::DS_WRITE_B32:
546case AMDGPU::DS_WRITE_B32_gfx9:
547case AMDGPU::DS_WRITE_B64:
548case AMDGPU::DS_WRITE_B64_gfx9:
549return DS_WRITE;
550case AMDGPU::GLOBAL_LOAD_DWORD:
551case AMDGPU::GLOBAL_LOAD_DWORDX2:
552case AMDGPU::GLOBAL_LOAD_DWORDX3:
553case AMDGPU::GLOBAL_LOAD_DWORDX4:
554case AMDGPU::FLAT_LOAD_DWORD:
555case AMDGPU::FLAT_LOAD_DWORDX2:
556case AMDGPU::FLAT_LOAD_DWORDX3:
557case AMDGPU::FLAT_LOAD_DWORDX4:
558return FLAT_LOAD;
559case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
560case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
561case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
562case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
563return GLOBAL_LOAD_SADDR;
564case AMDGPU::GLOBAL_STORE_DWORD:
565case AMDGPU::GLOBAL_STORE_DWORDX2:
566case AMDGPU::GLOBAL_STORE_DWORDX3:
567case AMDGPU::GLOBAL_STORE_DWORDX4:
568case AMDGPU::FLAT_STORE_DWORD:
569case AMDGPU::FLAT_STORE_DWORDX2:
570case AMDGPU::FLAT_STORE_DWORDX3:
571case AMDGPU::FLAT_STORE_DWORDX4:
572return FLAT_STORE;
573case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
574case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
575case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
576case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
577return GLOBAL_STORE_SADDR;
578 }
579}
580
581/// Determines instruction subclass from opcode. Only instructions
582/// of the same subclass can be merged together. The merged instruction may have
583/// a different subclass but must have the same class.
584staticunsigned getInstSubclass(unsigned Opc,constSIInstrInfo &TII) {
585switch (Opc) {
586default:
587if (TII.isMUBUF(Opc))
588returnAMDGPU::getMUBUFBaseOpcode(Opc);
589if (TII.isImage(Opc)) {
590constAMDGPU::MIMGInfo *Info =AMDGPU::getMIMGInfo(Opc);
591assert(Info);
592returnInfo->BaseOpcode;
593 }
594if (TII.isMTBUF(Opc))
595returnAMDGPU::getMTBUFBaseOpcode(Opc);
596return -1;
597case AMDGPU::DS_READ_B32:
598case AMDGPU::DS_READ_B32_gfx9:
599case AMDGPU::DS_READ_B64:
600case AMDGPU::DS_READ_B64_gfx9:
601case AMDGPU::DS_WRITE_B32:
602case AMDGPU::DS_WRITE_B32_gfx9:
603case AMDGPU::DS_WRITE_B64:
604case AMDGPU::DS_WRITE_B64_gfx9:
605return Opc;
606case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
607case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
608case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
609case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
610case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
611case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
612case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
613case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
614case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
615return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
616case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
617case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
618case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
619case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
620case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
621case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
622case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
623case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
624case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
625return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
626case AMDGPU::S_LOAD_DWORD_IMM:
627case AMDGPU::S_LOAD_DWORDX2_IMM:
628case AMDGPU::S_LOAD_DWORDX3_IMM:
629case AMDGPU::S_LOAD_DWORDX4_IMM:
630case AMDGPU::S_LOAD_DWORDX8_IMM:
631case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
632case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
633case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
634case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
635return AMDGPU::S_LOAD_DWORD_IMM;
636case AMDGPU::GLOBAL_LOAD_DWORD:
637case AMDGPU::GLOBAL_LOAD_DWORDX2:
638case AMDGPU::GLOBAL_LOAD_DWORDX3:
639case AMDGPU::GLOBAL_LOAD_DWORDX4:
640case AMDGPU::FLAT_LOAD_DWORD:
641case AMDGPU::FLAT_LOAD_DWORDX2:
642case AMDGPU::FLAT_LOAD_DWORDX3:
643case AMDGPU::FLAT_LOAD_DWORDX4:
644return AMDGPU::FLAT_LOAD_DWORD;
645case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
646case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
647case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
648case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
649return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
650case AMDGPU::GLOBAL_STORE_DWORD:
651case AMDGPU::GLOBAL_STORE_DWORDX2:
652case AMDGPU::GLOBAL_STORE_DWORDX3:
653case AMDGPU::GLOBAL_STORE_DWORDX4:
654case AMDGPU::FLAT_STORE_DWORD:
655case AMDGPU::FLAT_STORE_DWORDX2:
656case AMDGPU::FLAT_STORE_DWORDX3:
657case AMDGPU::FLAT_STORE_DWORDX4:
658return AMDGPU::FLAT_STORE_DWORD;
659case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
660case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
661case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
662case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
663return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
664 }
665}
666
667// GLOBAL loads and stores are classified as FLAT initially. If both combined
668// instructions are FLAT GLOBAL, adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
669// If either or both instructions are non-segment-specific FLAT, the resulting
670// combined operation will be FLAT, potentially promoting one of the GLOBAL
671// operations to FLAT.
672// For other instructions, return the original class unmodified.
673InstClassEnum
674SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
675const CombineInfo &Paired) {
676assert(CI.InstClass == Paired.InstClass);
677
678if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
679SIInstrInfo::isFLATGlobal(*CI.I) &&SIInstrInfo::isFLATGlobal(*Paired.I))
680return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;
681
682return CI.InstClass;
683}
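
// Illustrative example: two GLOBAL_LOAD_DWORD instructions both carry the
// FLAT_LOAD class, but because both are segment-specific global accesses the
// common class is reported as GLOBAL_LOAD, so the pair can be merged into a
// GLOBAL_* rather than a FLAT_* opcode. If either instruction were a plain
// FLAT_LOAD_DWORD, the common class would stay FLAT_LOAD and the global
// access would effectively be promoted to FLAT.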
684
685static AddressRegs getRegs(unsigned Opc,constSIInstrInfo &TII) {
686 AddressRegs Result;
687
688if (TII.isMUBUF(Opc)) {
689if (AMDGPU::getMUBUFHasVAddr(Opc))
690 Result.VAddr =true;
691if (AMDGPU::getMUBUFHasSrsrc(Opc))
692 Result.SRsrc =true;
693if (AMDGPU::getMUBUFHasSoffset(Opc))
694 Result.SOffset =true;
695
696return Result;
697 }
698
699if (TII.isImage(Opc)) {
700int VAddr0Idx =AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
701if (VAddr0Idx >= 0) {
702int RsrcName =
703TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
704int RsrcIdx =AMDGPU::getNamedOperandIdx(Opc, RsrcName);
705 Result.NumVAddrs = RsrcIdx - VAddr0Idx;
706 }else {
707 Result.VAddr =true;
708 }
709 Result.SRsrc =true;
710constAMDGPU::MIMGInfo *Info =AMDGPU::getMIMGInfo(Opc);
711if (Info &&AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
712 Result.SSamp =true;
713
714return Result;
715 }
716if (TII.isMTBUF(Opc)) {
717if (AMDGPU::getMTBUFHasVAddr(Opc))
718 Result.VAddr =true;
719if (AMDGPU::getMTBUFHasSrsrc(Opc))
720 Result.SRsrc =true;
721if (AMDGPU::getMTBUFHasSoffset(Opc))
722 Result.SOffset =true;
723
724return Result;
725 }
726
727switch (Opc) {
728default:
729return Result;
730case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
731case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
732case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
733case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
734case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
735case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
736case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
737case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
738case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
739 Result.SOffset =true;
740 [[fallthrough]];
741case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
742case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
743case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
744case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
745case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
746case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
747case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
748case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
749case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
750case AMDGPU::S_LOAD_DWORD_IMM:
751case AMDGPU::S_LOAD_DWORDX2_IMM:
752case AMDGPU::S_LOAD_DWORDX3_IMM:
753case AMDGPU::S_LOAD_DWORDX4_IMM:
754case AMDGPU::S_LOAD_DWORDX8_IMM:
755case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
756case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
757case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
758case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
759 Result.SBase =true;
760return Result;
761case AMDGPU::DS_READ_B32:
762case AMDGPU::DS_READ_B64:
763case AMDGPU::DS_READ_B32_gfx9:
764case AMDGPU::DS_READ_B64_gfx9:
765case AMDGPU::DS_WRITE_B32:
766case AMDGPU::DS_WRITE_B64:
767case AMDGPU::DS_WRITE_B32_gfx9:
768case AMDGPU::DS_WRITE_B64_gfx9:
769 Result.Addr =true;
770return Result;
771case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
772case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
773case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
774case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
775case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
776case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
777case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
778case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
779 Result.SAddr =true;
780 [[fallthrough]];
781case AMDGPU::GLOBAL_LOAD_DWORD:
782case AMDGPU::GLOBAL_LOAD_DWORDX2:
783case AMDGPU::GLOBAL_LOAD_DWORDX3:
784case AMDGPU::GLOBAL_LOAD_DWORDX4:
785case AMDGPU::GLOBAL_STORE_DWORD:
786case AMDGPU::GLOBAL_STORE_DWORDX2:
787case AMDGPU::GLOBAL_STORE_DWORDX3:
788case AMDGPU::GLOBAL_STORE_DWORDX4:
789case AMDGPU::FLAT_LOAD_DWORD:
790case AMDGPU::FLAT_LOAD_DWORDX2:
791case AMDGPU::FLAT_LOAD_DWORDX3:
792case AMDGPU::FLAT_LOAD_DWORDX4:
793case AMDGPU::FLAT_STORE_DWORD:
794case AMDGPU::FLAT_STORE_DWORDX2:
795case AMDGPU::FLAT_STORE_DWORDX3:
796case AMDGPU::FLAT_STORE_DWORDX4:
797 Result.VAddr =true;
798return Result;
799 }
800}
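
// For illustration: a BUFFER_LOAD_DWORD_OFFEN has a VGPR address, a resource
// descriptor and a scalar offset, so getRegs reports VAddr, SRsrc and SOffset
// and setMI below records three address operand indices for it. A DS_READ_B32
// reports only Addr, while an image opcode reports NumVAddrs separate vaddr
// components plus SRsrc (and SSamp when the base opcode uses a sampler).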
801
802void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iteratorMI,
803const SILoadStoreOptimizer &LSO) {
804I =MI;
805unsigned Opc =MI->getOpcode();
806 InstClass = getInstClass(Opc, *LSO.TII);
807
808if (InstClass == UNKNOWN)
809return;
810
811 IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));
812
813switch (InstClass) {
814case DS_READ:
815 EltSize =
816 (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
817 : 4;
818break;
819case DS_WRITE:
820 EltSize =
821 (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
822 : 4;
823break;
824case S_BUFFER_LOAD_IMM:
825case S_BUFFER_LOAD_SGPR_IMM:
826case S_LOAD_IMM:
827 EltSize =AMDGPU::convertSMRDOffsetUnits(*LSO.STM, 4);
828break;
829default:
830 EltSize = 4;
831break;
832 }
833
834if (InstClass == MIMG) {
835 DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
836// Offset is not considered for MIMG instructions.
837Offset = 0;
838 }else {
839int OffsetIdx =AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
840Offset =I->getOperand(OffsetIdx).getImm();
841 }
842
843if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
844 Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();
845
846 Width = getOpcodeWidth(*I, *LSO.TII);
847
848if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
849Offset &= 0xffff;
 850 } else if (InstClass != MIMG) {
851 CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();
852 }
853
854 AddressRegs Regs = getRegs(Opc, *LSO.TII);
855bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
856
857 NumAddresses = 0;
858for (unsigned J = 0; J < Regs.NumVAddrs; J++)
859 AddrIdx[NumAddresses++] =
860AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
861if (Regs.Addr)
862 AddrIdx[NumAddresses++] =
863AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
864if (Regs.SBase)
865 AddrIdx[NumAddresses++] =
866AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
867if (Regs.SRsrc)
868 AddrIdx[NumAddresses++] =AMDGPU::getNamedOperandIdx(
869 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
870if (Regs.SOffset)
871 AddrIdx[NumAddresses++] =
872AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
873if (Regs.SAddr)
874 AddrIdx[NumAddresses++] =
875AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
876if (Regs.VAddr)
877 AddrIdx[NumAddresses++] =
878AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
879if (Regs.SSamp)
880 AddrIdx[NumAddresses++] =AMDGPU::getNamedOperandIdx(
881 Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
882assert(NumAddresses <= MaxAddressRegs);
883
884for (unsigned J = 0; J < NumAddresses; J++)
885 AddrReg[J] = &I->getOperand(AddrIdx[J]);
886}
887
888}// end anonymous namespace.
889
890INITIALIZE_PASS_BEGIN(SILoadStoreOptimizerLegacy,DEBUG_TYPE,
891"SI Load Store Optimizer",false,false)
892INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
893INITIALIZE_PASS_END(SILoadStoreOptimizerLegacy,DEBUG_TYPE,
894 "SI Load StoreOptimizer",false,false)
895
896char SILoadStoreOptimizerLegacy::ID = 0;
897
898char &llvm::SILoadStoreOptimizerLegacyID = SILoadStoreOptimizerLegacy::ID;
899
900FunctionPass *llvm::createSILoadStoreOptimizerLegacyPass() {
901returnnew SILoadStoreOptimizerLegacy();
902}
903
904staticvoidaddDefsUsesToList(constMachineInstr &MI,
905DenseSet<Register> &RegDefs,
906DenseSet<Register> &RegUses) {
907for (constauto &Op :MI.operands()) {
908if (!Op.isReg())
909continue;
910if (Op.isDef())
911 RegDefs.insert(Op.getReg());
912if (Op.readsReg())
913 RegUses.insert(Op.getReg());
914 }
915}
916
917bool SILoadStoreOptimizer::canSwapInstructions(
918constDenseSet<Register> &ARegDefs,constDenseSet<Register> &ARegUses,
919constMachineInstr &A,constMachineInstr &B) const{
920if (A.mayLoadOrStore() &&B.mayLoadOrStore() &&
921 (A.mayStore() ||B.mayStore()) &&A.mayAlias(AA,B,true))
922returnfalse;
923for (constauto &BOp :B.operands()) {
924if (!BOp.isReg())
925continue;
926if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
927returnfalse;
928if (BOp.isDef() && ARegUses.contains(BOp.getReg()))
929returnfalse;
930 }
931returntrue;
932}
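
// A small worked example: when hoisting the second load of a pair upwards,
// this is called with A = the load being moved and B = each instruction that
// currently sits between the pair. A "v_add_u32 v3, v1, v2" in between blocks
// the move only if it reads or writes a register the load defines, or writes
// a register the load uses (e.g. its address); memory aliasing is only
// checked when at least one of the two instructions is a store.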
933
934// Given that \p CI and \p Paired are adjacent memory operations, produce a new
935// MMO for the combined operation with a new access size.
936MachineMemOperand *
937SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
938const CombineInfo &Paired) {
939constMachineMemOperand *MMOa = *CI.I->memoperands_begin();
940constMachineMemOperand *MMOb = *Paired.I->memoperands_begin();
941
942unsignedSize = MMOa->getSize().getValue() + MMOb->getSize().getValue();
943
944// A base pointer for the combined operation is the same as the leading
945// operation's pointer.
946if (Paired < CI)
947std::swap(MMOa, MMOb);
948
949MachinePointerInfo PtrInfo(MMOa->getPointerInfo());
950// If merging FLAT and GLOBAL set address space to FLAT.
951if (MMOb->getAddrSpace() ==AMDGPUAS::FLAT_ADDRESS)
952 PtrInfo.AddrSpace =AMDGPUAS::FLAT_ADDRESS;
953
954MachineFunction *MF = CI.I->getMF();
955return MF->getMachineMemOperand(MMOa, PtrInfo,Size);
956}
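
// For illustration: the combined MMO simply reports the sum of the two access
// sizes and reuses the leading (lower-offset) operation's pointer info, so a
// 4-byte and an 8-byte access merge into a single 12-byte MMO. The address
// space is widened to FLAT when the pair mixes flat and global accesses.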
957
958bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
959constSIInstrInfo &TII,
960const CombineInfo &Paired) {
961assert(CI.InstClass == MIMG);
962
963// Ignore instructions with tfe/lwe set.
964constauto *TFEOp =TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
965constauto *LWEOp =TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
966
967if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
968returnfalse;
969
970// Check other optional immediate operands for equality.
971unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
972 AMDGPU::OpName::unorm, AMDGPU::OpName::da,
973 AMDGPU::OpName::r128, AMDGPU::OpName::a16};
974
975for (autoop : OperandsToMatch) {
976intIdx =AMDGPU::getNamedOperandIdx(CI.I->getOpcode(),op);
977if (AMDGPU::getNamedOperandIdx(Paired.I->getOpcode(),op) !=Idx)
978returnfalse;
979if (Idx != -1 &&
980 CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())
981returnfalse;
982 }
983
984// Check DMask for overlaps.
985unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
986unsigned MinMask = std::min(CI.DMask, Paired.DMask);
987
988if (!MaxMask)
989returnfalse;
990
991unsigned AllowedBitsForMin =llvm::countr_zero(MaxMask);
992if ((1u << AllowedBitsForMin) <= MinMask)
993returnfalse;
994
995returntrue;
996}
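
// Worked dmask example: dmasks 0b0011 and 0b1100 can be combined -- MaxMask =
// 0b1100 has two trailing zeros and 1 << 2 = 4 is greater than MinMask =
// 0b0011, so every component selected by the smaller mask lies below every
// component selected by the larger one. Dmasks 0b0011 and 0b0110 are
// rejected, since 1 << 1 = 2 <= 0b0011: the smaller mask does not fit
// entirely below the larger one.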
997
998staticunsignedgetBufferFormatWithCompCount(unsigned OldFormat,
999unsigned ComponentCount,
1000constGCNSubtarget &STI) {
1001if (ComponentCount > 4)
1002return 0;
1003
1004constllvm::AMDGPU::GcnBufferFormatInfo *OldFormatInfo =
1005llvm::AMDGPU::getGcnBufferFormatInfo(OldFormat, STI);
1006if (!OldFormatInfo)
1007return 0;
1008
1009constllvm::AMDGPU::GcnBufferFormatInfo *NewFormatInfo =
1010llvm::AMDGPU::getGcnBufferFormatInfo(OldFormatInfo->BitsPerComp,
1011 ComponentCount,
1012 OldFormatInfo->NumFormat, STI);
1013
1014if (!NewFormatInfo)
1015return 0;
1016
1017assert(NewFormatInfo->NumFormat == OldFormatInfo->NumFormat &&
1018 NewFormatInfo->BitsPerComp == OldFormatInfo->BitsPerComp);
1019
1020return NewFormatInfo->Format;
1021}
1022
1023// Return the value in the inclusive range [Lo,Hi] that is aligned to the
1024// highest power of two. Note that the result is well defined for all inputs
1025// including corner cases like:
1026// - if Lo == Hi, return that value
1027// - if Lo == 0, return 0 (even though the "- 1" below underflows)
1028// - if Lo > Hi, return 0 (as if the range wrapped around)
1029staticuint32_tmostAlignedValueInRange(uint32_tLo,uint32_tHi) {
1030returnHi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^Hi) + 1);
1031}
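
// Worked examples: mostAlignedValueInRange(5, 10) returns 8 (the most aligned
// value in [5, 10]), mostAlignedValueInRange(7, 7) returns 7, and
// mostAlignedValueInRange(0, 10) returns 0, matching the corner cases listed
// above.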
1032
1033bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
1034constGCNSubtarget &STI,
1035 CombineInfo &Paired,
1036bool Modify) {
1037assert(CI.InstClass != MIMG);
1038
1039// XXX - Would the same offset be OK? Is there any reason this would happen or
1040// be useful?
1041if (CI.Offset == Paired.Offset)
1042returnfalse;
1043
1044// This won't be valid if the offset isn't aligned.
1045if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))
1046returnfalse;
1047
1048if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
1049
1050constllvm::AMDGPU::GcnBufferFormatInfo *Info0 =
1051llvm::AMDGPU::getGcnBufferFormatInfo(CI.Format, STI);
1052if (!Info0)
1053returnfalse;
1054constllvm::AMDGPU::GcnBufferFormatInfo *Info1 =
1055llvm::AMDGPU::getGcnBufferFormatInfo(Paired.Format, STI);
1056if (!Info1)
1057returnfalse;
1058
1059if (Info0->BitsPerComp != Info1->BitsPerComp ||
1060 Info0->NumFormat != Info1->NumFormat)
1061returnfalse;
1062
1063// TODO: Should be possible to support more formats, but if format loads
1064// are not dword-aligned, the merged load might not be valid.
1065if (Info0->BitsPerComp != 32)
1066returnfalse;
1067
1068if (getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, STI) == 0)
1069returnfalse;
1070 }
1071
1072uint32_t EltOffset0 = CI.Offset / CI.EltSize;
1073uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
1074 CI.UseST64 =false;
1075 CI.BaseOff = 0;
1076
1077// Handle all non-DS instructions.
1078if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
1079if (EltOffset0 + CI.Width != EltOffset1 &&
1080 EltOffset1 + Paired.Width != EltOffset0)
1081returnfalse;
1082if (CI.CPol != Paired.CPol)
1083returnfalse;
1084if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
1085 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
1086// Reject cases like:
1087// dword + dwordx2 -> dwordx3
1088// dword + dwordx3 -> dwordx4
1089// If we tried to combine these cases, we would fail to extract a subreg
1090// for the result of the second load due to SGPR alignment requirements.
1091if (CI.Width != Paired.Width &&
1092 (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
1093returnfalse;
1094 }
1095returntrue;
1096 }
1097
1098// If the offset in elements doesn't fit in 8-bits, we might be able to use
1099// the stride 64 versions.
1100if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
1101 isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
1102if (Modify) {
1103 CI.Offset = EltOffset0 / 64;
1104 Paired.Offset = EltOffset1 / 64;
1105 CI.UseST64 =true;
1106 }
1107returntrue;
1108 }
1109
1110// Check if the new offsets fit in the reduced 8-bit range.
1111if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
1112if (Modify) {
1113 CI.Offset = EltOffset0;
1114 Paired.Offset = EltOffset1;
1115 }
1116returntrue;
1117 }
1118
1119// Try to shift base address to decrease offsets.
1120uint32_t Min = std::min(EltOffset0, EltOffset1);
1121uint32_tMax = std::max(EltOffset0, EltOffset1);
1122
1123constuint32_tMask = maskTrailingOnes<uint32_t>(8) * 64;
1124if (((Max - Min) & ~Mask) == 0) {
1125if (Modify) {
1126// From the range of values we could use for BaseOff, choose the one that
1127// is aligned to the highest power of two, to maximise the chance that
1128// the same offset can be reused for other load/store pairs.
1129uint32_t BaseOff =mostAlignedValueInRange(Max - 0xff * 64, Min);
1130// Copy the low bits of the offsets, so that when we adjust them by
1131// subtracting BaseOff they will be multiples of 64.
1132 BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
1133 CI.BaseOff = BaseOff * CI.EltSize;
1134 CI.Offset = (EltOffset0 - BaseOff) / 64;
1135 Paired.Offset = (EltOffset1 - BaseOff) / 64;
1136 CI.UseST64 =true;
1137 }
1138returntrue;
1139 }
1140
1141if (isUInt<8>(Max - Min)) {
1142if (Modify) {
1143// From the range of values we could use for BaseOff, choose the one that
1144// is aligned to the highest power of two, to maximise the chance that
1145// the same offset can be reused for other load/store pairs.
1146uint32_t BaseOff =mostAlignedValueInRange(Max - 0xff, Min);
1147 CI.BaseOff = BaseOff * CI.EltSize;
1148 CI.Offset = EltOffset0 - BaseOff;
1149 Paired.Offset = EltOffset1 - BaseOff;
1150 }
1151returntrue;
1152 }
1153
1154returnfalse;
1155}
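
// A worked DS example: two ds_read_b32 at byte offsets 0x4000 and 0x4100 have
// element offsets 4096 and 4160. Neither fits in 8 bits, but both are
// multiples of 64, so the ST64 form is used with element offsets 64 and 65.
// When the element offsets are not both 64-aligned, the later checks above
// try the plain 8-bit encoding or subtract a common BaseOff so that the
// remaining deltas fit.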
1156
1157bool SILoadStoreOptimizer::widthsFit(constGCNSubtarget &STM,
1158const CombineInfo &CI,
1159const CombineInfo &Paired) {
1160constunsigned Width = (CI.Width + Paired.Width);
1161switch (CI.InstClass) {
1162default:
1163return (Width <= 4) && (STM.hasDwordx3LoadStores() || (Width != 3));
1164case S_BUFFER_LOAD_IMM:
1165case S_BUFFER_LOAD_SGPR_IMM:
1166case S_LOAD_IMM:
1167switch (Width) {
1168default:
1169returnfalse;
1170case 2:
1171case 4:
1172case 8:
1173returntrue;
1174case 3:
1175return STM.hasScalarDwordx3Loads();
1176 }
1177 }
1178}
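
// For illustration: merging a dword load with a dwordx2 load gives Width == 3,
// which is only allowed when the subtarget has dwordx3 load/stores; for the
// scalar classes the merged width must be 2, 4, 8, or 3 when scalar dwordx3
// loads are supported, the only result widths this pass produces for them.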
1179
1180constTargetRegisterClass *
1181SILoadStoreOptimizer::getDataRegClass(constMachineInstr &MI) const{
1182if (constauto *Dst =TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
1183returnTRI->getRegClassForReg(*MRI, Dst->getReg());
1184 }
1185if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
1186returnTRI->getRegClassForReg(*MRI, Src->getReg());
1187 }
1188if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
1189returnTRI->getRegClassForReg(*MRI, Src->getReg());
1190 }
1191if (constauto *Dst =TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
1192returnTRI->getRegClassForReg(*MRI, Dst->getReg());
1193 }
1194if (constauto *Src =TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
1195returnTRI->getRegClassForReg(*MRI, Src->getReg());
1196 }
1197returnnullptr;
1198}
1199
1200/// This function assumes that CI comes before Paired in a basic block. Return
1201/// an insertion point for the merged instruction or nullptr on failure.
1202SILoadStoreOptimizer::CombineInfo *
1203SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
1204 CombineInfo &Paired) {
1205// If another instruction has already been merged into CI, it may now be a
1206// type that we can't do any further merging into.
1207if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)
1208returnnullptr;
1209assert(CI.InstClass == Paired.InstClass);
1210
1211if (getInstSubclass(CI.I->getOpcode(), *TII) !=
1212 getInstSubclass(Paired.I->getOpcode(), *TII))
1213returnnullptr;
1214
1215// Check both offsets (or masks for MIMG) can be combined and fit in the
1216// reduced range.
1217if (CI.InstClass == MIMG) {
1218if (!dmasksCanBeCombined(CI, *TII, Paired))
1219returnnullptr;
1220 }else {
1221if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))
1222returnnullptr;
1223 }
1224
1225DenseSet<Register> RegDefs;
1226DenseSet<Register> RegUses;
1227 CombineInfo *Where;
1228if (CI.I->mayLoad()) {
1229// Try to hoist Paired up to CI.
1230addDefsUsesToList(*Paired.I, RegDefs, RegUses);
1231for (MachineBasicBlock::iteratorMBBI = Paired.I; --MBBI != CI.I;) {
1232if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))
1233returnnullptr;
1234 }
1235 Where = &CI;
1236 }else {
1237// Try to sink CI down to Paired.
1238addDefsUsesToList(*CI.I, RegDefs, RegUses);
1239for (MachineBasicBlock::iteratorMBBI = CI.I; ++MBBI != Paired.I;) {
1240if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))
1241returnnullptr;
1242 }
1243 Where = &Paired;
1244 }
1245
1246// Call offsetsCanBeCombined with modify = true so that the offsets are
1247// correct for the new instruction. This should return true, because
1248// this function should only be called on CombineInfo objects that
1249// have already been confirmed to be mergeable.
1250if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
1251 offsetsCanBeCombined(CI, *STM, Paired,true);
1252return Where;
1253}
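
// For illustration: for a pair of loads the second (paired) load is hoisted up
// next to the first, so the merge point is CI; for stores the first store is
// sunk down to the second, so the merge point is Paired. The returned
// CombineInfo marks where the merged instruction is intended to be inserted.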
1254
1255// Copy the merged load result from DestReg to the original dest regs of CI and
1256// Paired.
1257void SILoadStoreOptimizer::copyToDestRegs(
1258 CombineInfo &CI, CombineInfo &Paired,
1259MachineBasicBlock::iterator InsertBefore,intOpName,
1260Register DestReg) const{
1261MachineBasicBlock *MBB = CI.I->getParent();
1262DebugLocDL = CI.I->getDebugLoc();
1263
1264auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1265
1266// Copy to the old destination registers.
1267constMCInstrDesc &CopyDesc =TII->get(TargetOpcode::COPY);
1268auto *Dest0 =TII->getNamedOperand(*CI.I,OpName);
1269auto *Dest1 =TII->getNamedOperand(*Paired.I,OpName);
1270
1271// The constrained sload instructions in S_LOAD_IMM class will have
1272// `early-clobber` flag in the dst operand. Remove the flag before using the
1273// MOs in copies.
1274 Dest0->setIsEarlyClobber(false);
1275 Dest1->setIsEarlyClobber(false);
1276
1277BuildMI(*MBB, InsertBefore,DL, CopyDesc)
1278 .add(*Dest0)// Copy to same destination including flags and sub reg.
1279 .addReg(DestReg, 0, SubRegIdx0);
1280BuildMI(*MBB, InsertBefore,DL, CopyDesc)
1281 .add(*Dest1)
1282 .addReg(DestReg,RegState::Kill, SubRegIdx1);
1283}
1284
1285// Return a register for the source of the merged store after copying the
1286// original source regs of CI and Paired into it.
1287Register
1288SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
1289MachineBasicBlock::iterator InsertBefore,
1290intOpName) const{
1291MachineBasicBlock *MBB = CI.I->getParent();
1292DebugLocDL = CI.I->getDebugLoc();
1293
1294auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);
1295
1296// Copy to the new source register.
1297constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1298Register SrcReg =MRI->createVirtualRegister(SuperRC);
1299
1300constauto *Src0 =TII->getNamedOperand(*CI.I,OpName);
1301constauto *Src1 =TII->getNamedOperand(*Paired.I,OpName);
1302
1303BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::REG_SEQUENCE), SrcReg)
1304 .add(*Src0)
1305 .addImm(SubRegIdx0)
1306 .add(*Src1)
1307 .addImm(SubRegIdx1);
1308
1309return SrcReg;
1310}
1311
1312unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const{
1313if (STM->ldsRequiresM0Init())
1314return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
1315return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
1316}
1317
1318unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const{
1319if (STM->ldsRequiresM0Init())
1320return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
1321
1322return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
1323 : AMDGPU::DS_READ2ST64_B64_gfx9;
1324}
1325
1326MachineBasicBlock::iterator
1327SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
1328MachineBasicBlock::iterator InsertBefore) {
1329MachineBasicBlock *MBB = CI.I->getParent();
1330
1331// Be careful, since the addresses could be subregisters themselves in weird
1332// cases, like vectors of pointers.
1333constauto *AddrReg =TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1334
1335unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
1336unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
1337unsigned Opc =
1338 CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);
1339
1340assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1341 (NewOffset0 != NewOffset1) &&"Computed offset doesn't fit");
1342
1343constMCInstrDesc &Read2Desc =TII->get(Opc);
1344
1345constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1346Register DestReg =MRI->createVirtualRegister(SuperRC);
1347
1348DebugLocDL = CI.I->getDebugLoc();
1349
1350Register BaseReg = AddrReg->getReg();
1351unsigned BaseSubReg = AddrReg->getSubReg();
1352unsigned BaseRegFlags = 0;
1353if (CI.BaseOff) {
1354Register ImmReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1355BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::S_MOV_B32), ImmReg)
1356 .addImm(CI.BaseOff);
1357
1358 BaseReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1359 BaseRegFlags =RegState::Kill;
1360
1361TII->getAddNoCarry(*MBB, InsertBefore,DL, BaseReg)
1362 .addReg(ImmReg)
1363 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1364 .addImm(0);// clamp bit
1365 BaseSubReg = 0;
1366 }
1367
1368MachineInstrBuilder Read2 =
1369BuildMI(*MBB, InsertBefore,DL, Read2Desc, DestReg)
1370 .addReg(BaseReg, BaseRegFlags, BaseSubReg)// addr
1371 .addImm(NewOffset0)// offset0
1372 .addImm(NewOffset1)// offset1
1373 .addImm(0)// gds
1374 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1375
1376 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1377
1378 CI.I->eraseFromParent();
1379 Paired.I->eraseFromParent();
1380
1381LLVM_DEBUG(dbgs() <<"Inserted read2: " << *Read2 <<'\n');
1382return Read2;
1383}
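
// Illustrative result: for the ds_read_b32 pair from the file header comment
// this conceptually emits
//   ds_read2_b32 v[0:1], v2 offset0:4 offset1:8
// writing a fresh 64-bit virtual register, followed by two COPYs from the low
// and high halves of that register back to the pair's original vdst registers
// (see copyToDestRegs above).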
1384
1385unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const{
1386if (STM->ldsRequiresM0Init())
1387return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
1388return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
1389 : AMDGPU::DS_WRITE2_B64_gfx9;
1390}
1391
1392unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const{
1393if (STM->ldsRequiresM0Init())
1394return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
1395 : AMDGPU::DS_WRITE2ST64_B64;
1396
1397return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
1398 : AMDGPU::DS_WRITE2ST64_B64_gfx9;
1399}
1400
1401MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
1402 CombineInfo &CI, CombineInfo &Paired,
1403MachineBasicBlock::iterator InsertBefore) {
1404MachineBasicBlock *MBB = CI.I->getParent();
1405
1406// Be sure to use .addOperand(), and not .addReg() with these. We want to be
1407// sure we preserve the subregister index and any register flags set on them.
1408constMachineOperand *AddrReg =
1409TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
1410constMachineOperand *Data0 =
1411TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
1412constMachineOperand *Data1 =
1413TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);
1414
1415unsigned NewOffset0 = CI.Offset;
1416unsigned NewOffset1 = Paired.Offset;
1417unsigned Opc =
1418 CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);
1419
1420if (NewOffset0 > NewOffset1) {
1421// Canonicalize the merged instruction so the smaller offset comes first.
1422std::swap(NewOffset0, NewOffset1);
1423std::swap(Data0, Data1);
1424 }
1425
1426assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
1427 (NewOffset0 != NewOffset1) &&"Computed offset doesn't fit");
1428
1429constMCInstrDesc &Write2Desc =TII->get(Opc);
1430DebugLocDL = CI.I->getDebugLoc();
1431
1432Register BaseReg = AddrReg->getReg();
1433unsigned BaseSubReg = AddrReg->getSubReg();
1434unsigned BaseRegFlags = 0;
1435if (CI.BaseOff) {
1436Register ImmReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1437BuildMI(*MBB, InsertBefore,DL,TII->get(AMDGPU::S_MOV_B32), ImmReg)
1438 .addImm(CI.BaseOff);
1439
1440 BaseReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1441 BaseRegFlags =RegState::Kill;
1442
1443TII->getAddNoCarry(*MBB, InsertBefore,DL, BaseReg)
1444 .addReg(ImmReg)
1445 .addReg(AddrReg->getReg(), 0, BaseSubReg)
1446 .addImm(0);// clamp bit
1447 BaseSubReg = 0;
1448 }
1449
1450MachineInstrBuilder Write2 =
1451BuildMI(*MBB, InsertBefore,DL, Write2Desc)
1452 .addReg(BaseReg, BaseRegFlags, BaseSubReg)// addr
1453 .add(*Data0)// data0
1454 .add(*Data1)// data1
1455 .addImm(NewOffset0)// offset0
1456 .addImm(NewOffset1)// offset1
1457 .addImm(0)// gds
1458 .cloneMergedMemRefs({&*CI.I, &*Paired.I});
1459
1460 CI.I->eraseFromParent();
1461 Paired.I->eraseFromParent();
1462
1463LLVM_DEBUG(dbgs() <<"Inserted write2 inst: " << *Write2 <<'\n');
1464return Write2;
1465}
1466
1467MachineBasicBlock::iterator
1468SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
1469MachineBasicBlock::iterator InsertBefore) {
1470MachineBasicBlock *MBB = CI.I->getParent();
1471DebugLocDL = CI.I->getDebugLoc();
1472constunsigned Opcode = getNewOpcode(CI, Paired);
1473
1474constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1475
1476Register DestReg =MRI->createVirtualRegister(SuperRC);
1477unsigned MergedDMask = CI.DMask | Paired.DMask;
1478unsigned DMaskIdx =
1479AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
1480
1481auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);
1482for (unsignedI = 1, E = (*CI.I).getNumOperands();I != E; ++I) {
1483if (I == DMaskIdx)
1484 MIB.addImm(MergedDMask);
1485else
1486 MIB.add((*CI.I).getOperand(I));
1487 }
1488
1489// It shouldn't be possible to get this far if the two instructions
1490// don't have a single memoperand, because MachineInstr::mayAlias()
1491// will return true if this is the case.
1492assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1493
1494MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1495
1496 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1497
1498 CI.I->eraseFromParent();
1499 Paired.I->eraseFromParent();
1500returnNew;
1501}
1502
1503MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
1504 CombineInfo &CI, CombineInfo &Paired,
1505MachineBasicBlock::iterator InsertBefore) {
1506MachineBasicBlock *MBB = CI.I->getParent();
1507DebugLocDL = CI.I->getDebugLoc();
1508constunsigned Opcode = getNewOpcode(CI, Paired);
1509
1510constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1511
1512Register DestReg =MRI->createVirtualRegister(SuperRC);
1513unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1514
1515// It shouldn't be possible to get this far if the two instructions
1516// don't have a single memoperand, because MachineInstr::mayAlias()
1517// will return true if this is the case.
1518assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1519
1520MachineInstrBuilderNew =
1521BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg)
1522 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
1523if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
1524New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
1525New.addImm(MergedOffset);
1526New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1527
1528 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);
1529
1530 CI.I->eraseFromParent();
1531 Paired.I->eraseFromParent();
1532returnNew;
1533}
1534
1535MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
1536 CombineInfo &CI, CombineInfo &Paired,
1537MachineBasicBlock::iterator InsertBefore) {
1538MachineBasicBlock *MBB = CI.I->getParent();
1539DebugLocDL = CI.I->getDebugLoc();
1540
1541constunsigned Opcode = getNewOpcode(CI, Paired);
1542
1543constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1544
1545// Copy to the new source register.
1546Register DestReg =MRI->createVirtualRegister(SuperRC);
1547unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1548
1549auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);
1550
1551 AddressRegs Regs = getRegs(Opcode, *TII);
1552
1553if (Regs.VAddr)
1554 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1555
1556// It shouldn't be possible to get this far if the two instructions
1557// don't have a single memoperand, because MachineInstr::mayAlias()
1558// will return true if this is the case.
1559assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1560
1561MachineInstr *New =
1562 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1563 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1564 .addImm(MergedOffset)// offset
1565 .addImm(CI.CPol)// cpol
1566 .addImm(0)// swz
1567 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1568
1569 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1570
1571 CI.I->eraseFromParent();
1572 Paired.I->eraseFromParent();
1573returnNew;
1574}
1575
1576MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
1577 CombineInfo &CI, CombineInfo &Paired,
1578MachineBasicBlock::iterator InsertBefore) {
1579MachineBasicBlock *MBB = CI.I->getParent();
1580DebugLocDL = CI.I->getDebugLoc();
1581
1582constunsigned Opcode = getNewOpcode(CI, Paired);
1583
1584constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1585
1586// Copy to the new source register.
1587Register DestReg =MRI->createVirtualRegister(SuperRC);
1588unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);
1589
1590auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);
1591
1592 AddressRegs Regs = getRegs(Opcode, *TII);
1593
1594if (Regs.VAddr)
1595 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1596
1597unsigned JoinedFormat =
1598getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1599
1600// It shouldn't be possible to get this far if the two instructions
1601// don't have a single memoperand, because MachineInstr::mayAlias()
1602// will return true if this is the case.
1603assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1604
1605MachineInstr *New =
1606 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1607 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1608 .addImm(MergedOffset)// offset
1609 .addImm(JoinedFormat)// format
1610 .addImm(CI.CPol)// cpol
1611 .addImm(0)// swz
1612 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1613
1614 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);
1615
1616 CI.I->eraseFromParent();
1617 Paired.I->eraseFromParent();
1618returnNew;
1619}
1620
1621MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
1622 CombineInfo &CI, CombineInfo &Paired,
1623MachineBasicBlock::iterator InsertBefore) {
1624MachineBasicBlock *MBB = CI.I->getParent();
1625DebugLocDL = CI.I->getDebugLoc();
1626
1627constunsigned Opcode = getNewOpcode(CI, Paired);
1628
1629Register SrcReg =
1630 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1631
1632auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))
1633 .addReg(SrcReg,RegState::Kill);
1634
1635 AddressRegs Regs = getRegs(Opcode, *TII);
1636
1637if (Regs.VAddr)
1638 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1639
1640unsigned JoinedFormat =
1641getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
1642
1643// It shouldn't be possible to get this far if the two instructions
1644// don't have a single memoperand, because MachineInstr::mayAlias()
1645// will return true if this is the case.
1646assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1647
1648MachineInstr *New =
1649 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1650 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1651 .addImm(std::min(CI.Offset, Paired.Offset))// offset
1652 .addImm(JoinedFormat)// format
1653 .addImm(CI.CPol)// cpol
1654 .addImm(0)// swz
1655 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1656
1657 CI.I->eraseFromParent();
1658 Paired.I->eraseFromParent();
1659returnNew;
1660}
1661
1662MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatLoadPair(
1663 CombineInfo &CI, CombineInfo &Paired,
1664MachineBasicBlock::iterator InsertBefore) {
1665MachineBasicBlock *MBB = CI.I->getParent();
1666DebugLocDL = CI.I->getDebugLoc();
1667
1668constunsigned Opcode = getNewOpcode(CI, Paired);
1669
1670constTargetRegisterClass *SuperRC = getTargetRegisterClass(CI, Paired);
1671Register DestReg =MRI->createVirtualRegister(SuperRC);
1672
1673auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode), DestReg);
1674
1675if (auto *SAddr =TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1676 MIB.add(*SAddr);
1677
1678MachineInstr *New =
1679 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1680 .addImm(std::min(CI.Offset, Paired.Offset))
1681 .addImm(CI.CPol)
1682 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1683
1684 copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1685
1686 CI.I->eraseFromParent();
1687 Paired.I->eraseFromParent();
1688returnNew;
1689}
1690
1691MachineBasicBlock::iterator SILoadStoreOptimizer::mergeFlatStorePair(
1692 CombineInfo &CI, CombineInfo &Paired,
1693MachineBasicBlock::iterator InsertBefore) {
1694MachineBasicBlock *MBB = CI.I->getParent();
1695DebugLocDL = CI.I->getDebugLoc();
1696
1697constunsigned Opcode = getNewOpcode(CI, Paired);
1698
1699Register SrcReg =
1700 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1701
1702auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))
1703 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
1704 .addReg(SrcReg,RegState::Kill);
1705
1706if (auto *SAddr =TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))
1707 MIB.add(*SAddr);
1708
1709MachineInstr *New =
1710 MIB.addImm(std::min(CI.Offset, Paired.Offset))
1711 .addImm(CI.CPol)
1712 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1713
1714 CI.I->eraseFromParent();
1715 Paired.I->eraseFromParent();
1716returnNew;
1717}
1718
1719static bool needsConstrainedOpcode(const GCNSubtarget &STM,
1720 ArrayRef<MachineMemOperand *> MMOs,
1721 unsigned Width) {
1722 // Conservatively returns true if the MMO is not found.
1723 return STM.isXNACKEnabled() &&
1724 (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
1725}
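// Illustrative sketch, not from the original source: with XNACK enabled, the
// check above keys off the first load's single memoperand. Width is counted
// in dwords, so a width-2 merge wants at least 8-byte alignment; anything
// less (or a missing/ambiguous MMO) selects the constrained "_ec" opcode.
//   needsConstrainedOpcode(STM, {MMO aligned to 4}, /*Width=*/2) -> true
//   needsConstrainedOpcode(STM, {MMO aligned to 8}, /*Width=*/2) -> false
//   needsConstrainedOpcode(STM, {}, /*Width=*/2) -> true (conservative)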
1726
1727unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
1728const CombineInfo &Paired) {
1729constunsigned Width = CI.Width + Paired.Width;
1730
1731switch (getCommonInstClass(CI, Paired)) {
1732default:
1733assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
1734// FIXME: Handle d16 correctly
1735returnAMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
1736 Width);
1737case TBUFFER_LOAD:
1738case TBUFFER_STORE:
1739returnAMDGPU::getMTBUFOpcode(AMDGPU::getMTBUFBaseOpcode(CI.I->getOpcode()),
1740 Width);
1741
1742caseUNKNOWN:
1743llvm_unreachable("Unknown instruction class");
1744case S_BUFFER_LOAD_IMM: {
1745// If XNACK is enabled, use the constrained opcodes when the first load is
1746// under-aligned.
1747bool NeedsConstrainedOpc =
1748needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1749switch (Width) {
1750default:
1751return 0;
1752case 2:
1753return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1754 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1755case 3:
1756return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1757 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1758case 4:
1759return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1760 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1761case 8:
1762return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1763 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1764 }
1765 }
1766case S_BUFFER_LOAD_SGPR_IMM: {
1767// If XNACK is enabled, use the constrained opcodes when the first load is
1768// under-aligned.
1769bool NeedsConstrainedOpc =
1770needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1771switch (Width) {
1772default:
1773return 0;
1774case 2:
1775return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1776 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1777case 3:
1778return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1779 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1780case 4:
1781return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1782 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1783case 8:
1784return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1785 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1786 }
1787 }
1788case S_LOAD_IMM: {
1789// If XNACK is enabled, use the constrained opcodes when the first load is
1790// under-aligned.
1791bool NeedsConstrainedOpc =
1792needsConstrainedOpcode(*STM, CI.I->memoperands(), Width);
1793switch (Width) {
1794default:
1795return 0;
1796case 2:
1797return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1798 : AMDGPU::S_LOAD_DWORDX2_IMM;
1799case 3:
1800return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1801 : AMDGPU::S_LOAD_DWORDX3_IMM;
1802case 4:
1803return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1804 : AMDGPU::S_LOAD_DWORDX4_IMM;
1805case 8:
1806return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1807 : AMDGPU::S_LOAD_DWORDX8_IMM;
1808 }
1809 }
1810case GLOBAL_LOAD:
1811switch (Width) {
1812default:
1813return 0;
1814case 2:
1815return AMDGPU::GLOBAL_LOAD_DWORDX2;
1816case 3:
1817return AMDGPU::GLOBAL_LOAD_DWORDX3;
1818case 4:
1819return AMDGPU::GLOBAL_LOAD_DWORDX4;
1820 }
1821case GLOBAL_LOAD_SADDR:
1822switch (Width) {
1823default:
1824return 0;
1825case 2:
1826return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1827case 3:
1828return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1829case 4:
1830return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1831 }
1832case GLOBAL_STORE:
1833switch (Width) {
1834default:
1835return 0;
1836case 2:
1837return AMDGPU::GLOBAL_STORE_DWORDX2;
1838case 3:
1839return AMDGPU::GLOBAL_STORE_DWORDX3;
1840case 4:
1841return AMDGPU::GLOBAL_STORE_DWORDX4;
1842 }
1843case GLOBAL_STORE_SADDR:
1844switch (Width) {
1845default:
1846return 0;
1847case 2:
1848return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1849case 3:
1850return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1851case 4:
1852return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1853 }
1854case FLAT_LOAD:
1855switch (Width) {
1856default:
1857return 0;
1858case 2:
1859return AMDGPU::FLAT_LOAD_DWORDX2;
1860case 3:
1861return AMDGPU::FLAT_LOAD_DWORDX3;
1862case 4:
1863return AMDGPU::FLAT_LOAD_DWORDX4;
1864 }
1865case FLAT_STORE:
1866switch (Width) {
1867default:
1868return 0;
1869case 2:
1870return AMDGPU::FLAT_STORE_DWORDX2;
1871case 3:
1872return AMDGPU::FLAT_STORE_DWORDX3;
1873case 4:
1874return AMDGPU::FLAT_STORE_DWORDX4;
1875 }
1876caseMIMG:
1877assert(((unsigned)llvm::popcount(CI.DMask | Paired.DMask) == Width) &&
1878"No overlaps");
1879returnAMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
1880 }
1881}
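// A couple of worked examples of the opcode selection above (illustrative,
// assuming no constrained opcode is required): merging two S_LOAD_DWORD_IMM
// loads gives Width == 2 and selects AMDGPU::S_LOAD_DWORDX2_IMM; merging a
// two-dword global load with an adjacent one-dword global load gives
// Width == 3 and selects AMDGPU::GLOBAL_LOAD_DWORDX3.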
1882
1883std::pair<unsigned, unsigned>
1884SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
1885const CombineInfo &Paired) {
1886assert((CI.InstClass != MIMG ||
1887 ((unsigned)llvm::popcount(CI.DMask | Paired.DMask) ==
1888 CI.Width + Paired.Width)) &&
1889"No overlaps");
1890
1891unsigned Idx0;
1892unsigned Idx1;
1893
1894staticconstunsigned Idxs[5][4] = {
1895 {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
1896 {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
1897 {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
1898 {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
1899 {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
1900 };
1901
1902assert(CI.Width >= 1 && CI.Width <= 4);
1903assert(Paired.Width >= 1 && Paired.Width <= 4);
1904
1905if (Paired < CI) {
1906 Idx1 = Idxs[0][Paired.Width - 1];
1907 Idx0 = Idxs[Paired.Width][CI.Width - 1];
1908 }else {
1909 Idx0 = Idxs[0][CI.Width - 1];
1910 Idx1 = Idxs[CI.Width][Paired.Width - 1];
1911 }
1912
1913return {Idx0, Idx1};
1914}
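// Worked example of the Idxs table above (illustrative): merging a one-dword
// access (CI.Width == 1) with a two-dword access (Paired.Width == 2) on the
// else branch yields Idx0 == AMDGPU::sub0 and Idx1 == AMDGPU::sub1_sub2,
// i.e. CI's value occupies lane 0 of the merged register and Paired's value
// occupies lanes 1-2.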
1915
1916constTargetRegisterClass *
1917SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
1918const CombineInfo &Paired) const{
1919if (CI.InstClass == S_BUFFER_LOAD_IMM ||
1920 CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
1921switch (CI.Width + Paired.Width) {
1922default:
1923returnnullptr;
1924case 2:
1925return &AMDGPU::SReg_64_XEXECRegClass;
1926case 3:
1927return &AMDGPU::SGPR_96RegClass;
1928case 4:
1929return &AMDGPU::SGPR_128RegClass;
1930case 8:
1931return &AMDGPU::SGPR_256RegClass;
1932case 16:
1933return &AMDGPU::SGPR_512RegClass;
1934 }
1935 }
1936
1937unsignedBitWidth = 32 * (CI.Width + Paired.Width);
1938returnTRI->isAGPRClass(getDataRegClass(*CI.I))
1939 ?TRI->getAGPRClassForBitWidth(BitWidth)
1940 :TRI->getVGPRClassForBitWidth(BitWidth);
1941}
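// For instance (illustrative): a two-dword scalar merge selects
// SReg_64_XEXEC, while a three-dword VMEM merge computes BitWidth == 96 and
// asks TRI for the matching VGPR (or AGPR) class.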
1942
1943MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
1944 CombineInfo &CI, CombineInfo &Paired,
1945MachineBasicBlock::iterator InsertBefore) {
1946MachineBasicBlock *MBB = CI.I->getParent();
1947DebugLocDL = CI.I->getDebugLoc();
1948
1949constunsigned Opcode = getNewOpcode(CI, Paired);
1950
1951Register SrcReg =
1952 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1953
1954auto MIB =BuildMI(*MBB, InsertBefore,DL,TII->get(Opcode))
1955 .addReg(SrcReg,RegState::Kill);
1956
1957 AddressRegs Regs = getRegs(Opcode, *TII);
1958
1959if (Regs.VAddr)
1960 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1961
1962
1963// It shouldn't be possible to get this far if the two instructions
1964// don't have a single memoperand, because MachineInstr::mayAlias()
1965// will return true if this is the case.
1966assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1967
1968MachineInstr *New =
1969 MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1970 .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1971 .addImm(std::min(CI.Offset, Paired.Offset))// offset
1972 .addImm(CI.CPol)// cpol
1973 .addImm(0)// swz
1974 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1975
1976 CI.I->eraseFromParent();
1977 Paired.I->eraseFromParent();
1978returnNew;
1979}
1980
1981MachineOperand
1982SILoadStoreOptimizer::createRegOrImm(int32_t Val,MachineInstr &MI) const{
1983APIntV(32, Val,true);
1984if (TII->isInlineConstant(V))
1985returnMachineOperand::CreateImm(Val);
1986
1987RegisterReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1988MachineInstr *Mov =
1989BuildMI(*MI.getParent(),MI.getIterator(),MI.getDebugLoc(),
1990TII->get(AMDGPU::S_MOV_B32), Reg)
1991 .addImm(Val);
1992 (void)Mov;
1993LLVM_DEBUG(dbgs() <<" "; Mov->dump());
1994returnMachineOperand::CreateReg(Reg,false);
1995}
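// Sketch of the helper above (values chosen for illustration): 64 is an
// inline constant, so createRegOrImm(64, MI) returns an immediate operand;
// 8000 is not, so createRegOrImm(8000, MI) emits
//   %r:sreg_32 = S_MOV_B32 8000
// and returns %r as a register operand.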
1996
1997// Compute base address using Addr and return the final register.
1998Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
1999const MemAddress &Addr) const{
2000MachineBasicBlock *MBB =MI.getParent();
2001MachineBasicBlock::iteratorMBBI =MI.getIterator();
2002DebugLocDL =MI.getDebugLoc();
2003
2004assert((TRI->getRegSizeInBits(Addr.Base.LoReg, *MRI) == 32 ||
2005Addr.Base.LoSubReg) &&
2006"Expected 32-bit Base-Register-Low!!");
2007
2008assert((TRI->getRegSizeInBits(Addr.Base.HiReg, *MRI) == 32 ||
2009Addr.Base.HiSubReg) &&
2010"Expected 32-bit Base-Register-Hi!!");
2011
2012LLVM_DEBUG(dbgs() <<" Re-Computed Anchor-Base:\n");
2013MachineOperand OffsetLo = createRegOrImm(static_cast<int32_t>(Addr.Offset),MI);
2014MachineOperand OffsetHi =
2015 createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32),MI);
2016
2017constauto *CarryRC =TRI->getWaveMaskRegClass();
2018Register CarryReg =MRI->createVirtualRegister(CarryRC);
2019Register DeadCarryReg =MRI->createVirtualRegister(CarryRC);
2020
2021Register DestSub0 =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2022Register DestSub1 =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2023MachineInstr *LoHalf =
2024BuildMI(*MBB,MBBI,DL,TII->get(AMDGPU::V_ADD_CO_U32_e64), DestSub0)
2025 .addReg(CarryReg,RegState::Define)
2026 .addReg(Addr.Base.LoReg, 0,Addr.Base.LoSubReg)
2027 .add(OffsetLo)
2028 .addImm(0);// clamp bit
2029 (void)LoHalf;
2030LLVM_DEBUG(dbgs() <<" "; LoHalf->dump(););
2031
2032MachineInstr *HiHalf =
2033BuildMI(*MBB,MBBI,DL,TII->get(AMDGPU::V_ADDC_U32_e64), DestSub1)
2034 .addReg(DeadCarryReg,RegState::Define |RegState::Dead)
2035 .addReg(Addr.Base.HiReg, 0,Addr.Base.HiSubReg)
2036 .add(OffsetHi)
2037 .addReg(CarryReg,RegState::Kill)
2038 .addImm(0);// clamp bit
2039 (void)HiHalf;
2040LLVM_DEBUG(dbgs() <<" "; HiHalf->dump(););
2041
2042Register FullDestReg =MRI->createVirtualRegister(TRI->getVGPR64Class());
2043MachineInstr *FullBase =
2044BuildMI(*MBB,MBBI,DL,TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
2045 .addReg(DestSub0)
2046 .addImm(AMDGPU::sub0)
2047 .addReg(DestSub1)
2048 .addImm(AMDGPU::sub1);
2049 (void)FullBase;
2050LLVM_DEBUG(dbgs() <<" "; FullBase->dump();dbgs() <<"\n";);
2051
2052return FullDestReg;
2053}
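// The sequence materialized above looks roughly like this (illustrative MIR,
// register names invented):
//   %lo, %carry = V_ADD_CO_U32_e64 Addr.Base.LoReg, OffsetLo, 0
//   %hi, %dead  = V_ADDC_U32_e64 Addr.Base.HiReg, OffsetHi, %carry, 0
//   %full:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// %full is returned as the re-computed 64-bit base register.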
2054
2055// Update base and offset with the NewBase and NewOffset in MI.
2056void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
2057Register NewBase,
2058 int32_t NewOffset) const{
2059auto *Base =TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2060Base->setReg(NewBase);
2061Base->setIsKill(false);
2062TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
2063}
2064
2065std::optional<int32_t>
2066SILoadStoreOptimizer::extractConstOffset(constMachineOperand &Op) const{
2067if (Op.isImm())
2068returnOp.getImm();
2069
2070if (!Op.isReg())
2071return std::nullopt;
2072
2073MachineInstr *Def =MRI->getUniqueVRegDef(Op.getReg());
2074if (!Def ||Def->getOpcode() != AMDGPU::S_MOV_B32 ||
2075 !Def->getOperand(1).isImm())
2076return std::nullopt;
2077
2078returnDef->getOperand(1).getImm();
2079}
2080
2081// Analyze Base and extract:
2082// - 32-bit base registers and subregisters
2083// - 64-bit constant offset
2084// Expecting a base computation of the form:
2085// %OFFSET0:sgpr_32 = S_MOV_B32 8000
2086// %LO:vgpr_32, %c:sreg_64_xexec =
2087// V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
2088// %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
2089// %Base:vreg_64 =
2090// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
2091void SILoadStoreOptimizer::processBaseWithConstOffset(constMachineOperand &Base,
2092 MemAddress &Addr) const{
2093if (!Base.isReg())
2094return;
2095
2096MachineInstr *Def =MRI->getUniqueVRegDef(Base.getReg());
2097if (!Def ||Def->getOpcode() != AMDGPU::REG_SEQUENCE
2098 ||Def->getNumOperands() != 5)
2099return;
2100
2101MachineOperand BaseLo =Def->getOperand(1);
2102MachineOperand BaseHi =Def->getOperand(3);
2103if (!BaseLo.isReg() || !BaseHi.isReg())
2104return;
2105
2106MachineInstr *BaseLoDef =MRI->getUniqueVRegDef(BaseLo.getReg());
2107MachineInstr *BaseHiDef =MRI->getUniqueVRegDef(BaseHi.getReg());
2108
2109if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
2110 !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)
2111return;
2112
2113constauto *Src0 =TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
2114constauto *Src1 =TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);
2115
2116auto Offset0P = extractConstOffset(*Src0);
2117if (Offset0P)
2118 BaseLo = *Src1;
2119else {
2120if (!(Offset0P = extractConstOffset(*Src1)))
2121return;
2122 BaseLo = *Src0;
2123 }
2124
2125if (!BaseLo.isReg())
2126return;
2127
2128 Src0 =TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
2129 Src1 =TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);
2130
2131if (Src0->isImm())
2132std::swap(Src0, Src1);
2133
2134if (!Src1->isImm() || Src0->isImm())
2135return;
2136
2137uint64_t Offset1 = Src1->getImm();
2138 BaseHi = *Src0;
2139
2140if (!BaseHi.isReg())
2141return;
2142
2143Addr.Base.LoReg = BaseLo.getReg();
2144Addr.Base.HiReg = BaseHi.getReg();
2145Addr.Base.LoSubReg = BaseLo.getSubReg();
2146Addr.Base.HiSubReg = BaseHi.getSubReg();
2147Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2148}
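// Applied to the pattern in the comment above, this would record (sketch):
// Addr.Base.LoReg = %BASE_LO, Addr.Base.HiReg = %BASE_HI, and
// Addr.Offset = 8000 -- the low 32 bits come from the S_MOV_B32 feeding
// V_ADD_CO_U32_e64 and the high 32 bits from the V_ADDC_U32_e64 immediate
// (0 here).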
2149
2150bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2151MachineInstr &MI,
2152 MemInfoMap &Visited,
2153SmallPtrSet<MachineInstr *, 4> &AnchorList) const{
2154
2155if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI))
2156returnfalse;
2157
2158// TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers.
2159if (SIInstrInfo::isFLATScratch(MI))
2160returnfalse;
2161
2162unsigned AS =SIInstrInfo::isFLATGlobal(MI) ?AMDGPUAS::GLOBAL_ADDRESS
2163 :AMDGPUAS::FLAT_ADDRESS;
2164
2165if (AnchorList.count(&MI))
2166returnfalse;
2167
2168LLVM_DEBUG(dbgs() <<"\nTryToPromoteConstantOffsetToImmFor ";MI.dump());
2169
2170if (TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm()) {
2171LLVM_DEBUG(dbgs() <<" Const-offset is already promoted.\n";);
2172returnfalse;
2173 }
2174
2175// Step1: Find the base-registers and a 64bit constant offset.
2176MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
2177 MemAddress MAddr;
2178if (!Visited.contains(&MI)) {
2179 processBaseWithConstOffset(Base, MAddr);
2180 Visited[&MI] = MAddr;
2181 }else
2182 MAddr = Visited[&MI];
2183
2184if (MAddr.Offset == 0) {
2185LLVM_DEBUG(dbgs() <<" Failed to extract constant-offset or there are no"
2186" constant offsets that can be promoted.\n";);
2187returnfalse;
2188 }
2189
2190LLVM_DEBUG(dbgs() <<" BASE: {" <<printReg(MAddr.Base.HiReg,TRI) <<", "
2191 <<printReg(MAddr.Base.LoReg,TRI)
2192 <<"} Offset: " << MAddr.Offset <<"\n\n";);
2193
2194// Step2: Traverse MI's basic block and find an anchor (one with the same
2195// base registers) at the highest 13-bit distance from MI's offset.
2196// E.g. (64bit loads)
2197// bb:
2198// addr1 = &a + 4096; load1 = load(addr1, 0)
2199// addr2 = &a + 6144; load2 = load(addr2, 0)
2200// addr3 = &a + 8192; load3 = load(addr3, 0)
2201// addr4 = &a + 10240; load4 = load(addr4, 0)
2202// addr5 = &a + 12288; load5 = load(addr5, 0)
2203//
2204// Starting from the first load, the optimization tries to find a new base
2205// from which (&a + 4096) is within 13-bit distance. Both &a + 6144 and
2206// &a + 8192 are within 13-bit distance of &a + 4096. The heuristic picks
2207// &a + 8192 as the new base (anchor) because the larger distance can
2208// presumably accommodate more intermediate bases.
2209//
2210// Step3: move (&a + 8192) above load1. Compute and promote offsets from
2211// (&a + 8192) for load1, load2, load4.
2212// addr = &a + 8192
2213// load1 = load(addr, -4096)
2214// load2 = load(addr, -2048)
2215// load3 = load(addr, 0)
2216// load4 = load(addr, 2048)
2217// addr5 = &a + 12288; load5 = load(addr5, 0)
2218//
2219MachineInstr *AnchorInst =nullptr;
2220 MemAddress AnchorAddr;
2221uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2222SmallVector<std::pair<MachineInstr *, int64_t>, 4> InstsWCommonBase;
2223
2224MachineBasicBlock *MBB =MI.getParent();
2225MachineBasicBlock::iterator E =MBB->end();
2226MachineBasicBlock::iteratorMBBI =MI.getIterator();
2227 ++MBBI;
2228constSITargetLowering *TLI =
2229static_cast<constSITargetLowering *>(STM->getTargetLowering());
2230
2231for ( ;MBBI != E; ++MBBI) {
2232MachineInstr &MINext = *MBBI;
2233// TODO: Support finding an anchor(with same base) from store addresses or
2234// any other load addresses where the opcodes are different.
2235if (MINext.getOpcode() !=MI.getOpcode() ||
2236TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2237continue;
2238
2239constMachineOperand &BaseNext =
2240 *TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2241 MemAddress MAddrNext;
2242if (!Visited.contains(&MINext)) {
2243 processBaseWithConstOffset(BaseNext, MAddrNext);
2244 Visited[&MINext] = MAddrNext;
2245 }else
2246 MAddrNext = Visited[&MINext];
2247
2248if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2249 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2250 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2251 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2252continue;
2253
2254 InstsWCommonBase.emplace_back(&MINext, MAddrNext.Offset);
2255
2256 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2257TargetLoweringBase::AddrMode AM;
2258 AM.HasBaseReg =true;
2259 AM.BaseOffs = Dist;
2260if (TLI->isLegalFlatAddressingMode(AM, AS) &&
2261 (uint32_t)std::abs(Dist) > MaxDist) {
2262 MaxDist = std::abs(Dist);
2263
2264 AnchorAddr = MAddrNext;
2265 AnchorInst = &MINext;
2266 }
2267 }
2268
2269if (AnchorInst) {
2270LLVM_DEBUG(dbgs() <<" Anchor-Inst(with max-distance from Offset): ";
2271 AnchorInst->dump());
2272LLVM_DEBUG(dbgs() <<" Anchor-Offset from BASE: "
2273 << AnchorAddr.Offset <<"\n\n");
2274
2275// Instead of moving up, just re-compute anchor-instruction's base address.
2276RegisterBase = computeBase(MI, AnchorAddr);
2277
2278 updateBaseAndOffset(MI,Base, MAddr.Offset - AnchorAddr.Offset);
2279LLVM_DEBUG(dbgs() <<" After promotion: ";MI.dump(););
2280
2281for (auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2282TargetLoweringBase::AddrMode AM;
2283 AM.HasBaseReg =true;
2284 AM.BaseOffs = OtherOffset - AnchorAddr.Offset;
2285
2286if (TLI->isLegalFlatAddressingMode(AM, AS)) {
2287LLVM_DEBUG(dbgs() <<" Promote Offset(" << OtherOffset;dbgs() <<")";
2288 OtherMI->dump());
2289 updateBaseAndOffset(*OtherMI,Base, OtherOffset - AnchorAddr.Offset);
2290LLVM_DEBUG(dbgs() <<" After promotion: "; OtherMI->dump());
2291 }
2292 }
2293AnchorList.insert(AnchorInst);
2294returntrue;
2295 }
2296
2297returnfalse;
2298}
2299
2300void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
2301 std::list<std::list<CombineInfo> > &MergeableInsts) const{
2302for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2303if (AddrList.front().InstClass == CI.InstClass &&
2304 AddrList.front().IsAGPR == CI.IsAGPR &&
2305 AddrList.front().hasSameBaseAddress(CI)) {
2306 AddrList.emplace_back(CI);
2307return;
2308 }
2309 }
2310
2311// Base address not found, so add a new list.
2312 MergeableInsts.emplace_back(1, CI);
2313}
2314
2315std::pair<MachineBasicBlock::iterator, bool>
2316SILoadStoreOptimizer::collectMergeableInsts(
2317MachineBasicBlock::iterator Begin,MachineBasicBlock::iteratorEnd,
2318 MemInfoMap &Visited,SmallPtrSet<MachineInstr *, 4> &AnchorList,
2319 std::list<std::list<CombineInfo>> &MergeableInsts) const{
2320boolModified =false;
2321
2322// Sort potential mergeable instructions into lists. One list per base address.
2323unsigned Order = 0;
2324MachineBasicBlock::iterator BlockI = Begin;
2325for (; BlockI !=End; ++BlockI) {
2326MachineInstr &MI = *BlockI;
2327
2328// We run this before checking if an address is mergeable, because it can produce
2329// better code even if the instructions aren't mergeable.
2330if (promoteConstantOffsetToImm(MI, Visited,AnchorList))
2331Modified =true;
2332
2333// Treat volatile accesses, ordered accesses and unmodeled side effects as
2334// barriers. We can look past such a barrier for separate merges.
2335if (MI.hasOrderedMemoryRef() ||MI.hasUnmodeledSideEffects()) {
2336LLVM_DEBUG(dbgs() <<"Breaking search on barrier: " <<MI);
2337
2338// Search will resume after this instruction in a separate merge list.
2339 ++BlockI;
2340break;
2341 }
2342
2343const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
2344if (InstClass == UNKNOWN)
2345continue;
2346
2347// Do not merge VMEM buffer instructions with "swizzled" bit set.
2348int Swizzled =
2349AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::swz);
2350if (Swizzled != -1 &&MI.getOperand(Swizzled).getImm())
2351continue;
2352
2353 CombineInfo CI;
2354 CI.setMI(MI, *this);
2355 CI.Order = Order++;
2356
2357if (!CI.hasMergeableAddress(*MRI))
2358continue;
2359
2360if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2361// FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data
2362// operands. However, we report that ds_write2 takes only VGPR data so
2363// that machine copy propagation does not create an illegal instruction
2364// with VGPR and AGPR sources. Consequently, if we were to create such
2365// an instruction, the verifier would complain.
2367continue;
2368 }
2369
2370LLVM_DEBUG(dbgs() <<"Mergeable: " <<MI);
2371
2372addInstToMergeableList(CI, MergeableInsts);
2373 }
2374
2375// At this point we have lists of Mergeable instructions.
2376//
2377// Part 2: Sort lists by offset and then for each CombineInfo object in the
2378// list try to find an instruction that can be merged with I. If an instruction
2379// is found, it is stored in the Paired field. If no instructions are found, then
2380// the CombineInfo object is deleted from the list.
2381
2382for (std::list<std::list<CombineInfo>>::iteratorI = MergeableInsts.begin(),
2383 E = MergeableInsts.end();I != E;) {
2384
2385 std::list<CombineInfo> &MergeList = *I;
2386if (MergeList.size() <= 1) {
2387// This means we have found only one instruction with a given address
2388// that can be merged, and we need at least 2 instructions to do a merge,
2389// so this list can be discarded.
2390I = MergeableInsts.erase(I);
2391continue;
2392 }
2393
2394// Sort the lists by offsets, this way mergeable instructions will be
2395// adjacent to each other in the list, which will make it easier to find
2396// matches.
2397 MergeList.sort(
2398 [] (const CombineInfo &A,const CombineInfo &B) {
2399returnA.Offset <B.Offset;
2400 });
2401 ++I;
2402 }
2403
2404return {BlockI,Modified};
2405}
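// Illustrative example of the resulting structure: for a block containing
//   ds_read_b32 %a, %p offset:0
//   ds_read_b32 %b, %p offset:8
//   ds_read_b32 %c, %q offset:4
// this produces one list per base address (%p and %q), each sorted by
// offset; the single-element %q list is then discarded since a merge needs
// at least two instructions.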
2406
2407// Scan through looking for adjacent LDS operations with constant offsets from
2408// the same base register. We rely on the scheduler to do the hard work of
2409// clustering nearby loads, and assume these are all adjacent.
2410bool SILoadStoreOptimizer::optimizeBlock(
2411 std::list<std::list<CombineInfo> > &MergeableInsts) {
2412boolModified =false;
2413
2414for (std::list<std::list<CombineInfo>>::iteratorI = MergeableInsts.begin(),
2415 E = MergeableInsts.end();I != E;) {
2416 std::list<CombineInfo> &MergeList = *I;
2417
2418bool OptimizeListAgain =false;
2419if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2420// We weren't able to make any changes, so delete the list so we don't
2421// process the same instructions the next time we try to optimize this
2422// block.
2423I = MergeableInsts.erase(I);
2424continue;
2425 }
2426
2427Modified =true;
2428
2429// We made changes, but also determined that there were no more optimization
2430// opportunities, so we don't need to reprocess the list.
2431if (!OptimizeListAgain) {
2432I = MergeableInsts.erase(I);
2433continue;
2434 }
2435 OptimizeAgain =true;
2436 }
2437returnModified;
2438}
2439
2440bool
2441SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
2442 std::list<CombineInfo> &MergeList,
2443bool &OptimizeListAgain) {
2444if (MergeList.empty())
2445returnfalse;
2446
2447boolModified =false;
2448
2449for (autoI = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
2450 Next = std::next(I)) {
2451
2452autoFirst =I;
2453auto Second = Next;
2454
2455if ((*First).Order > (*Second).Order)
2456std::swap(First, Second);
2457 CombineInfo &CI = *First;
2458 CombineInfo &Paired = *Second;
2459
2460 CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
2461if (!Where) {
2462 ++I;
2463continue;
2464 }
2465
2466Modified =true;
2467
2468LLVM_DEBUG(dbgs() <<"Merging: " << *CI.I <<" with: " << *Paired.I);
2469
2470MachineBasicBlock::iterator NewMI;
2471switch (CI.InstClass) {
2472default:
2473llvm_unreachable("unknown InstClass");
2474break;
2475case DS_READ:
2476 NewMI = mergeRead2Pair(CI, Paired, Where->I);
2477break;
2478case DS_WRITE:
2479 NewMI = mergeWrite2Pair(CI, Paired, Where->I);
2480break;
2481case S_BUFFER_LOAD_IMM:
2482case S_BUFFER_LOAD_SGPR_IMM:
2483case S_LOAD_IMM:
2484 NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
2485 OptimizeListAgain |= CI.Width + Paired.Width < 8;
2486break;
2487caseBUFFER_LOAD:
2488 NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
2489 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2490break;
2491caseBUFFER_STORE:
2492 NewMI = mergeBufferStorePair(CI, Paired, Where->I);
2493 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2494break;
2495caseMIMG:
2496 NewMI = mergeImagePair(CI, Paired, Where->I);
2497 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2498break;
2499case TBUFFER_LOAD:
2500 NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
2501 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2502break;
2503case TBUFFER_STORE:
2504 NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
2505 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2506break;
2507case FLAT_LOAD:
2508case GLOBAL_LOAD:
2509case GLOBAL_LOAD_SADDR:
2510 NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
2511 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2512break;
2513case FLAT_STORE:
2514case GLOBAL_STORE:
2515case GLOBAL_STORE_SADDR:
2516 NewMI = mergeFlatStorePair(CI, Paired, Where->I);
2517 OptimizeListAgain |= CI.Width + Paired.Width < 4;
2518break;
2519 }
2520 CI.setMI(NewMI, *this);
2521 CI.Order = Where->Order;
2522if (I == Second)
2523I = Next;
2524
2525 MergeList.erase(Second);
2526 }
2527
2528returnModified;
2529}
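// Illustrative walk-through: merging two one-dword buffer loads gives a
// combined width of 2, which is still < 4, so OptimizeListAgain stays true
// and the new dwordx2 load is kept on the list for a possible later merge
// into a dwordx4.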
2530
2531bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2532 if (skipFunction(MF.getFunction()))
2533 return false;
2534 return SILoadStoreOptimizer(
2535 &getAnalysis<AAResultsWrapperPass>().getAAResults())
2536 .run(MF);
2537}
2538
2539bool SILoadStoreOptimizer::run(MachineFunction &MF) {
2540 STM = &MF.getSubtarget<GCNSubtarget>();
2541if (!STM->loadStoreOptEnabled())
2542returnfalse;
2543
2544TII = STM->getInstrInfo();
2545TRI = &TII->getRegisterInfo();
2546
2547MRI = &MF.getRegInfo();
2548
2549LLVM_DEBUG(dbgs() <<"Running SILoadStoreOptimizer\n");
2550
2551boolModified =false;
2552
2553// Contains the set of instructions for which constant offsets are being
2554// promoted to the IMM. This is tracked for one block at a time.
2555SmallPtrSet<MachineInstr *, 4>AnchorList;
2556 MemInfoMap Visited;
2557
2558for (MachineBasicBlock &MBB : MF) {
2559MachineBasicBlock::iterator SectionEnd;
2560for (MachineBasicBlock::iteratorI =MBB.begin(), E =MBB.end();I != E;
2561I = SectionEnd) {
2562bool CollectModified;
2563 std::list<std::list<CombineInfo>> MergeableInsts;
2564
2565// First pass: Collect list of all instructions we know how to merge in a
2566// subset of the block.
2567 std::tie(SectionEnd, CollectModified) =
2568collectMergeableInsts(I, E, Visited,AnchorList, MergeableInsts);
2569
2570Modified |= CollectModified;
2571
2572do {
2573 OptimizeAgain =false;
2574Modified |=optimizeBlock(MergeableInsts);
2575 }while (OptimizeAgain);
2576 }
2577
2578 Visited.clear();
2579AnchorList.clear();
2580 }
2581
2582returnModified;
2583}
2584
2585PreservedAnalyses
2586SILoadStoreOptimizerPass::run(MachineFunction &MF,
2587 MachineFunctionAnalysisManager &MFAM) {
2588 MFPropsModifier _(*this, MF);
2589
2590 if (MF.getFunction().hasOptNone())
2591 return PreservedAnalyses::all();
2592
2593 auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
2594 .getManager();
2595 AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());
2596
2597 bool Changed = SILoadStoreOptimizer(&AA).run(MF);
2598 if (!Changed)
2599 return PreservedAnalyses::all();
2600
2601 PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
2602 PA.preserveSet<CFGAnalyses>();
2603 return PA;
2604}