//===- SILoadStoreOptimizer.cpp -------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass tries to fuse DS instructions with close-by immediate offsets.
// This will fuse operations such as
//  ds_read_b32 v0, v2 offset:16
//  ds_read_b32 v1, v2 offset:32
// ==>
//  ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
//
// The same is done for certain SMEM and VMEM opcodes, e.g.:
//  s_buffer_load_dword s4, s[0:3], 4
//  s_buffer_load_dword s5, s[0:3], 8
// ==>
//  s_buffer_load_dwordx2 s[4:5], s[0:3], 4
//
// This pass also tries to promote a constant offset to the immediate by
// adjusting the base. It tries to use a base from the nearby instructions that
// allows it to have a 13-bit constant offset and then promotes the 13-bit
// offset to the immediate. E.g.:
//  s_movk_i32 s0, 0x1800
//  v_add_co_u32_e32 v0, vcc, s0, v2
//  v_addc_co_u32_e32 v1, vcc, 0, v6, vcc
//
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[0:1], off
// ==>
//  s_movk_i32 s0, 0x1000
//  v_add_co_u32_e32 v5, vcc, s0, v2
//  v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
//  global_load_dwordx2 v[5:6], v[5:6], off
//  global_load_dwordx2 v[0:1], v[5:6], off offset:2048
//
// (Both loads now share the base in v[5:6]; the 0x1800 - 0x1000 = 0x800 = 2048
// byte difference is folded into the second load's immediate offset.)
//
// Future improvements:
//
// - This is currently missing stores of constants because loading
//   the constant into the data register is placed between the stores, although
//   this is arguably a scheduling problem.
//
// - Live interval recomputing seems inefficient. This currently only matches
//   one pair, and recomputes live intervals and moves on to the next pair. It
//   would be better to compute a list of all merges that need to occur.
//
// - With a list of instructions to process, we can also merge more. If a
//   cluster of loads have offsets that are too large to fit in the 8-bit
//   offsets, but are close enough to fit in the 8 bits, we can add to the base
//   pointer and use the new reduced offsets.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "si-load-store-opt"

  S_BUFFER_LOAD_SGPR_IMM,
  GLOBAL_LOAD, // GLOBAL_LOAD/GLOBAL_STORE are never used as the InstClass of
  GLOBAL_STORE // any CombineInfo, they are only ever returned by
               // getCommonInstClass().

  unsigned char NumVAddrs = 0;

// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
const unsigned MaxAddressRegs = 12 + 1 + 1;
class SILoadStoreOptimizer {
  struct CombineInfo {
    InstClassEnum InstClass;

    int AddrIdx[MaxAddressRegs];
    const MachineOperand *AddrReg[MaxAddressRegs];
    unsigned NumAddresses;

    bool hasSameBaseAddress(const CombineInfo &CI) {
      if (NumAddresses != CI.NumAddresses)
        return false;

      for (unsigned i = 0; i < NumAddresses; i++) {
        if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
          if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
              AddrReg[i]->getImm() != AddrRegNext.getImm())
            return false;
        }

        // Check same base pointer. Be careful of subregisters, which can occur
        // with vectors of pointers.
      }
    }

    bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
      for (unsigned i = 0; i < NumAddresses; ++i) {
        // Immediates are always OK.

        // Don't try to merge addresses that aren't either immediates or
        // registers.
        // TODO: Should be possible to merge FrameIndexes and maybe some other

        // TODO: We should be able to merge instructions with other physical reg
        if (AddrOp->getReg() != AMDGPU::SGPR_NULL)
          return false;

        // If an address has only one use then there will be no other
        // instructions with the same address, so we can't merge this one.
        if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
          return false;
      }
    }

    // Compare by pointer order.
    return (InstClass == MIMG) ? DMask < Other.DMask : Offset < Other.Offset;

    unsigned LoSubReg = 0;
    unsigned HiSubReg = 0;
  static bool dmasksCanBeCombined(const CombineInfo &CI,
                                  const SIInstrInfo &TII,
                                  const CombineInfo &Paired);
  static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
                                   CombineInfo &Paired, bool Modify = false);
  static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
                        const CombineInfo &Paired);
  unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
  static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);
  const TargetRegisterClass *
  getTargetRegisterClass(const CombineInfo &CI,
                         const CombineInfo &Paired) const;

  CombineInfo *checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired);

  void copyToDestRegs(CombineInfo &CI, CombineInfo &Paired,
  Register copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,

  unsigned read2Opcode(unsigned EltSize) const;
  unsigned read2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);

  unsigned write2Opcode(unsigned EltSize) const;
  unsigned write2ST64Opcode(unsigned EltSize) const;
  MachineBasicBlock::iterator
  mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
                  MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
                 MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeSMemLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                      MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
                       MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
                        MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatLoadPair(CombineInfo &CI, CombineInfo &Paired,
                    MachineBasicBlock::iterator InsertBefore);
  MachineBasicBlock::iterator
  mergeFlatStorePair(CombineInfo &CI, CombineInfo &Paired,
                     MachineBasicBlock::iterator InsertBefore);

                           int32_t NewOffset) const;

  /// Promotes constant offset to the immediate by adjusting the base. It
  /// tries to use a base from the nearby instructions that allows it to have
  /// a 13bit constant offset which gets promoted to the immediate.

  void addInstToMergeableList(
      const CombineInfo &CI,
      std::list<std::list<CombineInfo>> &MergeableInsts) const;

      std::list<std::list<CombineInfo>> &MergeableInsts) const;

  static MachineMemOperand *combineKnownAdjacentMMOs(const CombineInfo &CI,
                                                     const CombineInfo &Paired);

  static InstClassEnum getCommonInstClass(const CombineInfo &CI,
                                          const CombineInfo &Paired);

  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
                                     bool &OptimizeListAgain);
  bool optimizeBlock(std::list<std::list<CombineInfo>> &MergeableInsts);
      .set(MachineFunctionProperties::Property::IsSSA);

  const unsigned Opc = MI.getOpcode();

  if (TII.isMUBUF(Opc)) {
    // FIXME: Handle d16 correctly

  if (TII.isImage(MI)) {
        TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();

  if (TII.isMTBUF(Opc)) {
348case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
349case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
350case AMDGPU::S_LOAD_DWORD_IMM:
351case AMDGPU::GLOBAL_LOAD_DWORD:
352case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
353case AMDGPU::GLOBAL_STORE_DWORD:
354case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
355case AMDGPU::FLAT_LOAD_DWORD:
356case AMDGPU::FLAT_STORE_DWORD:
358case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
359case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
360case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
361case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
362case AMDGPU::S_LOAD_DWORDX2_IMM:
363case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
364case AMDGPU::GLOBAL_LOAD_DWORDX2:
365case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
366case AMDGPU::GLOBAL_STORE_DWORDX2:
367case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
368case AMDGPU::FLAT_LOAD_DWORDX2:
369case AMDGPU::FLAT_STORE_DWORDX2:
371case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
372case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
373case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
374case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
375case AMDGPU::S_LOAD_DWORDX3_IMM:
376case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
377case AMDGPU::GLOBAL_LOAD_DWORDX3:
378case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
379case AMDGPU::GLOBAL_STORE_DWORDX3:
380case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
381case AMDGPU::FLAT_LOAD_DWORDX3:
382case AMDGPU::FLAT_STORE_DWORDX3:
384case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
385case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
386case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
387case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
388case AMDGPU::S_LOAD_DWORDX4_IMM:
389case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
390case AMDGPU::GLOBAL_LOAD_DWORDX4:
391case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
392case AMDGPU::GLOBAL_STORE_DWORDX4:
393case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
394case AMDGPU::FLAT_LOAD_DWORDX4:
395case AMDGPU::FLAT_STORE_DWORDX4:
397case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
398case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
399case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
400case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
401case AMDGPU::S_LOAD_DWORDX8_IMM:
402case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
404case AMDGPU::DS_READ_B32:
405case AMDGPU::DS_READ_B32_gfx9:
406case AMDGPU::DS_WRITE_B32:
407case AMDGPU::DS_WRITE_B32_gfx9:
409case AMDGPU::DS_READ_B64:
410case AMDGPU::DS_READ_B64_gfx9:
411case AMDGPU::DS_WRITE_B64:
412case AMDGPU::DS_WRITE_B64_gfx9:
/// Maps instruction opcode to enum InstClassEnum.
static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
427case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN:
428case AMDGPU::BUFFER_LOAD_DWORD_BOTHEN_exact:
429case AMDGPU::BUFFER_LOAD_DWORD_IDXEN:
430case AMDGPU::BUFFER_LOAD_DWORD_IDXEN_exact:
431case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
432case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
433case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
434case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
435case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN:
436case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:
437case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN:
438case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:
439case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
440case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
441case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
442case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
444case AMDGPU::BUFFER_STORE_DWORD_BOTHEN:
445case AMDGPU::BUFFER_STORE_DWORD_BOTHEN_exact:
446case AMDGPU::BUFFER_STORE_DWORD_IDXEN:
447case AMDGPU::BUFFER_STORE_DWORD_IDXEN_exact:
448case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
449case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
450case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
451case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
452case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN:
453case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact:
454case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN:
455case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_IDXEN_exact:
456case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
457case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
458case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
459case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
  if (TII.isImage(Opc)) {
    // Ignore instructions encoded without vaddr.

    // Ignore BVH instructions

    // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
    if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||

  if (TII.isMTBUF(Opc)) {
481case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
482case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
483case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
484case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
485case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
486case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
487case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
488case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
489case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
490case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
491case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
492case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
493case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
494case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
495case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
496case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
498case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
499case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
500case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
501case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
502case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
503case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
504case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
505case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
510case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
511case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
512case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
513case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
514case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
515case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
516case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
517case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
518case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
519return S_BUFFER_LOAD_IMM;
520case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
521case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
522case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
523case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
524case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
525case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
526case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
527case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
528case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
529return S_BUFFER_LOAD_SGPR_IMM;
530case AMDGPU::S_LOAD_DWORD_IMM:
531case AMDGPU::S_LOAD_DWORDX2_IMM:
532case AMDGPU::S_LOAD_DWORDX3_IMM:
533case AMDGPU::S_LOAD_DWORDX4_IMM:
534case AMDGPU::S_LOAD_DWORDX8_IMM:
535case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
536case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
537case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
538case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
540case AMDGPU::DS_READ_B32:
541case AMDGPU::DS_READ_B32_gfx9:
542case AMDGPU::DS_READ_B64:
543case AMDGPU::DS_READ_B64_gfx9:
545case AMDGPU::DS_WRITE_B32:
546case AMDGPU::DS_WRITE_B32_gfx9:
547case AMDGPU::DS_WRITE_B64:
548case AMDGPU::DS_WRITE_B64_gfx9:
550case AMDGPU::GLOBAL_LOAD_DWORD:
551case AMDGPU::GLOBAL_LOAD_DWORDX2:
552case AMDGPU::GLOBAL_LOAD_DWORDX3:
553case AMDGPU::GLOBAL_LOAD_DWORDX4:
554case AMDGPU::FLAT_LOAD_DWORD:
555case AMDGPU::FLAT_LOAD_DWORDX2:
556case AMDGPU::FLAT_LOAD_DWORDX3:
557case AMDGPU::FLAT_LOAD_DWORDX4:
559case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
560case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
561case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
562case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
563return GLOBAL_LOAD_SADDR;
564case AMDGPU::GLOBAL_STORE_DWORD:
565case AMDGPU::GLOBAL_STORE_DWORDX2:
566case AMDGPU::GLOBAL_STORE_DWORDX3:
567case AMDGPU::GLOBAL_STORE_DWORDX4:
568case AMDGPU::FLAT_STORE_DWORD:
569case AMDGPU::FLAT_STORE_DWORDX2:
570case AMDGPU::FLAT_STORE_DWORDX3:
571case AMDGPU::FLAT_STORE_DWORDX4:
573case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
574case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
575case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
576case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
577return GLOBAL_STORE_SADDR;
/// Determines instruction subclass from opcode. Only instructions
/// of the same subclass can be merged together. The merged instruction may
/// have a different subclass but must have the same class.
static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isImage(Opc)) {
    return Info->BaseOpcode;
597case AMDGPU::DS_READ_B32:
598case AMDGPU::DS_READ_B32_gfx9:
599case AMDGPU::DS_READ_B64:
600case AMDGPU::DS_READ_B64_gfx9:
601case AMDGPU::DS_WRITE_B32:
602case AMDGPU::DS_WRITE_B32_gfx9:
603case AMDGPU::DS_WRITE_B64:
604case AMDGPU::DS_WRITE_B64_gfx9:
606case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
607case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
608case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
609case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
610case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
611case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
612case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
613case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
614case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
615return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
616case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
617case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
618case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
619case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
620case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
621case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
622case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
623case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
624case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
625return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
626case AMDGPU::S_LOAD_DWORD_IMM:
627case AMDGPU::S_LOAD_DWORDX2_IMM:
628case AMDGPU::S_LOAD_DWORDX3_IMM:
629case AMDGPU::S_LOAD_DWORDX4_IMM:
630case AMDGPU::S_LOAD_DWORDX8_IMM:
631case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
632case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
633case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
634case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
635return AMDGPU::S_LOAD_DWORD_IMM;
636case AMDGPU::GLOBAL_LOAD_DWORD:
637case AMDGPU::GLOBAL_LOAD_DWORDX2:
638case AMDGPU::GLOBAL_LOAD_DWORDX3:
639case AMDGPU::GLOBAL_LOAD_DWORDX4:
640case AMDGPU::FLAT_LOAD_DWORD:
641case AMDGPU::FLAT_LOAD_DWORDX2:
642case AMDGPU::FLAT_LOAD_DWORDX3:
643case AMDGPU::FLAT_LOAD_DWORDX4:
644return AMDGPU::FLAT_LOAD_DWORD;
645case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
646case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
647case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
648case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
649return AMDGPU::GLOBAL_LOAD_DWORD_SADDR;
650case AMDGPU::GLOBAL_STORE_DWORD:
651case AMDGPU::GLOBAL_STORE_DWORDX2:
652case AMDGPU::GLOBAL_STORE_DWORDX3:
653case AMDGPU::GLOBAL_STORE_DWORDX4:
654case AMDGPU::FLAT_STORE_DWORD:
655case AMDGPU::FLAT_STORE_DWORDX2:
656case AMDGPU::FLAT_STORE_DWORDX3:
657case AMDGPU::FLAT_STORE_DWORDX4:
658return AMDGPU::FLAT_STORE_DWORD;
659case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
660case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
661case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
662case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
663return AMDGPU::GLOBAL_STORE_DWORD_SADDR;
// GLOBAL loads and stores are classified as FLAT initially. If both combined
// instructions are FLAT GLOBAL adjust the class to GLOBAL_LOAD or GLOBAL_STORE.
// If either or both instructions are non segment specific FLAT the resulting
// combined operation will be FLAT, potentially promoting one of the GLOBAL
// operations to FLAT.
// For other instructions return the original unmodified class.
SILoadStoreOptimizer::getCommonInstClass(const CombineInfo &CI,
                                         const CombineInfo &Paired) {
  assert(CI.InstClass == Paired.InstClass);

  if ((CI.InstClass == FLAT_LOAD || CI.InstClass == FLAT_STORE) &&
    return (CI.InstClass == FLAT_STORE) ? GLOBAL_STORE : GLOBAL_LOAD;

static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
  if (TII.isMUBUF(Opc)) {
    Result.SOffset = true;

  if (TII.isImage(Opc)) {
        TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
    Result.NumVAddrs = RsrcIdx - VAddr0Idx;

  if (TII.isMTBUF(Opc)) {
    Result.SOffset = true;
730case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
731case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
732case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
733case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
734case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
735case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec:
736case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec:
737case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec:
738case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec:
    Result.SOffset = true;
741case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
742case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
743case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
744case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
745case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
746case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec:
747case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec:
748case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec:
749case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec:
750case AMDGPU::S_LOAD_DWORD_IMM:
751case AMDGPU::S_LOAD_DWORDX2_IMM:
752case AMDGPU::S_LOAD_DWORDX3_IMM:
753case AMDGPU::S_LOAD_DWORDX4_IMM:
754case AMDGPU::S_LOAD_DWORDX8_IMM:
755case AMDGPU::S_LOAD_DWORDX2_IMM_ec:
756case AMDGPU::S_LOAD_DWORDX3_IMM_ec:
757case AMDGPU::S_LOAD_DWORDX4_IMM_ec:
758case AMDGPU::S_LOAD_DWORDX8_IMM_ec:
761case AMDGPU::DS_READ_B32:
762case AMDGPU::DS_READ_B64:
763case AMDGPU::DS_READ_B32_gfx9:
764case AMDGPU::DS_READ_B64_gfx9:
765case AMDGPU::DS_WRITE_B32:
766case AMDGPU::DS_WRITE_B64:
767case AMDGPU::DS_WRITE_B32_gfx9:
768case AMDGPU::DS_WRITE_B64_gfx9:
771case AMDGPU::GLOBAL_LOAD_DWORD_SADDR:
772case AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR:
773case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
774case AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR:
775case AMDGPU::GLOBAL_STORE_DWORD_SADDR:
776case AMDGPU::GLOBAL_STORE_DWORDX2_SADDR:
777case AMDGPU::GLOBAL_STORE_DWORDX3_SADDR:
778case AMDGPU::GLOBAL_STORE_DWORDX4_SADDR:
781case AMDGPU::GLOBAL_LOAD_DWORD:
782case AMDGPU::GLOBAL_LOAD_DWORDX2:
783case AMDGPU::GLOBAL_LOAD_DWORDX3:
784case AMDGPU::GLOBAL_LOAD_DWORDX4:
785case AMDGPU::GLOBAL_STORE_DWORD:
786case AMDGPU::GLOBAL_STORE_DWORDX2:
787case AMDGPU::GLOBAL_STORE_DWORDX3:
788case AMDGPU::GLOBAL_STORE_DWORDX4:
789case AMDGPU::FLAT_LOAD_DWORD:
790case AMDGPU::FLAT_LOAD_DWORDX2:
791case AMDGPU::FLAT_LOAD_DWORDX3:
792case AMDGPU::FLAT_LOAD_DWORDX4:
793case AMDGPU::FLAT_STORE_DWORD:
794case AMDGPU::FLAT_STORE_DWORDX2:
795case AMDGPU::FLAT_STORE_DWORDX3:
796case AMDGPU::FLAT_STORE_DWORDX4:
void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
                                              const SILoadStoreOptimizer &LSO) {
  unsigned Opc = MI->getOpcode();
  InstClass = getInstClass(Opc, *LSO.TII);

  if (InstClass == UNKNOWN)

  IsAGPR = LSO.TRI->hasAGPRs(LSO.getDataRegClass(*MI));

      (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
      (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

  if (InstClass == MIMG) {
    DMask = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
    // Offset is not considered for MIMG instructions.
  } else {
    Offset = I->getOperand(OffsetIdx).getImm();
  }

  if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
    Format = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::format)->getImm();

  Width = getOpcodeWidth(*I, *LSO.TII);

  if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
  } else if (InstClass != MIMG) {
    CPol = LSO.TII->getNamedOperand(*I, AMDGPU::OpName::cpol)->getImm();

  AddressRegs Regs = getRegs(Opc, *LSO.TII);
  bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);

  for (unsigned J = 0; J < Regs.NumVAddrs; J++)
    AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
      Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
  AddrIdx[NumAddresses++] =
      Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
  assert(NumAddresses <= MaxAddressRegs);

  for (unsigned J = 0; J < NumAddresses; J++)
    AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
// end anonymous namespace.

                      "SI Load Store Optimizer", false, false)
                    false, false)

char SILoadStoreOptimizerLegacy::ID = 0;

  return new SILoadStoreOptimizerLegacy();

  for (const auto &Op : MI.operands()) {

bool SILoadStoreOptimizer::canSwapInstructions(
  if (A.mayLoadOrStore() && B.mayLoadOrStore() &&
      (A.mayStore() || B.mayStore()) && A.mayAlias(AA, B, true))
  for (const auto &BOp : B.operands()) {
    if ((BOp.isDef() || BOp.readsReg()) && ARegDefs.contains(BOp.getReg()))
    if (BOp.isDef() && ARegUses.contains(BOp.getReg()))

// Given that \p CI and \p Paired are adjacent memory operations produce a new
// MMO for the combined operation with a new access size.
MachineMemOperand *
SILoadStoreOptimizer::combineKnownAdjacentMMOs(const CombineInfo &CI,
                                               const CombineInfo &Paired) {
  // A base pointer for the combined operation is the same as the leading
  // operation's pointer.

  // If merging FLAT and GLOBAL set address space to FLAT.

bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
                                               const SIInstrInfo &TII,
                                               const CombineInfo &Paired) {
  assert(CI.InstClass == MIMG);

  // Ignore instructions with tfe/lwe set.
  const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
  const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);

  if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))

  // Check other optional immediate operands for equality.
  unsigned OperandsToMatch[] = {AMDGPU::OpName::cpol, AMDGPU::OpName::d16,
                                AMDGPU::OpName::unorm, AMDGPU::OpName::da,
                                AMDGPU::OpName::r128, AMDGPU::OpName::a16};

  for (auto op : OperandsToMatch) {
        CI.I->getOperand(Idx).getImm() != Paired.I->getOperand(Idx).getImm())

  // Check DMask for overlaps.
  unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
  unsigned MinMask = std::min(CI.DMask, Paired.DMask);

  if ((1u << AllowedBitsForMin) <= MinMask)

                                       unsigned ComponentCount,
  if (ComponentCount > 4)

  return NewFormatInfo->Format;
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
// highest power of two. Note that the result is well defined for all inputs
// including corner cases like:
// - if Lo == Hi, return that value
// - if Lo == 0, return 0 (even though the "- 1" below underflows
// - if Lo > Hi, return 0 (as if the range wrapped around)

bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
                                                const GCNSubtarget &STI,
                                                CombineInfo &Paired,
                                                bool Modify) {
  assert(CI.InstClass != MIMG);

  // XXX - Would the same offset be OK? Is there any reason this would happen
  // or be useful?
  if (CI.Offset == Paired.Offset)

  // This won't be valid if the offset isn't aligned.
  if ((CI.Offset % CI.EltSize != 0) || (Paired.Offset % CI.EltSize != 0))

  if (CI.InstClass == TBUFFER_LOAD || CI.InstClass == TBUFFER_STORE) {
    // TODO: Should be possible to support more formats, but if format loads
    // are not dword-aligned, the merged load might not be valid.

  uint32_t EltOffset0 = CI.Offset / CI.EltSize;
  uint32_t EltOffset1 = Paired.Offset / CI.EltSize;

  // Handle all non-DS instructions.
  if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
    if (EltOffset0 + CI.Width != EltOffset1 &&
        EltOffset1 + Paired.Width != EltOffset0)

    if (CI.CPol != Paired.CPol)

    if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
        CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
      // Reject cases like:
      //   dword + dwordx2 -> dwordx3
      //   dword + dwordx3 -> dwordx4
      // If we tried to combine these cases, we would fail to extract a subreg
      // for the result of the second load due to SGPR alignment requirements.
      if (CI.Width != Paired.Width &&
          (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))

  // If the offset in elements doesn't fit in 8-bits, we might be able to use
  // the stride 64 versions.
  if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
      isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
    CI.Offset = EltOffset0 / 64;
    Paired.Offset = EltOffset1 / 64;

  // Check if the new offsets fit in the reduced 8-bit range.
  if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
    CI.Offset = EltOffset0;
    Paired.Offset = EltOffset1;

  // Try to shift base address to decrease offsets.
  uint32_t Min = std::min(EltOffset0, EltOffset1);

  if (((Max - Min) & ~Mask) == 0) {
    // From the range of values we could use for BaseOff, choose the one that
    // is aligned to the highest power of two, to maximise the chance that
    // the same offset can be reused for other load/store pairs.
    // Copy the low bits of the offsets, so that when we adjust them by
    // subtracting BaseOff they will be multiples of 64.
    BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
    CI.BaseOff = BaseOff * CI.EltSize;
    CI.Offset = (EltOffset0 - BaseOff) / 64;
    Paired.Offset = (EltOffset1 - BaseOff) / 64;

  if (isUInt<8>(Max - Min)) {
    // From the range of values we could use for BaseOff, choose the one that
    // is aligned to the highest power of two, to maximise the chance that
    // the same offset can be reused for other load/store pairs.
    CI.BaseOff = BaseOff * CI.EltSize;
    CI.Offset = EltOffset0 - BaseOff;
    Paired.Offset = EltOffset1 - BaseOff;
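
  // Worked example of the DS paths above (added for illustration; the offsets
  // are hypothetical, not taken from the original source):
  //  - ds_read_b32 at byte offsets 0 and 16384: EltSize = 4 gives element
  //    offsets 0 and 4096. 4096 does not fit in 8 bits, but both offsets are
  //    multiples of 64 and 0/64 = 0, 4096/64 = 64 do fit, so the stride-64
  //    (ST64) form is used with offsets 0 and 64.
  //  - Element offsets 300 and 310: neither fits in 8 bits and they are not
  //    both multiples of 64, but their distance (10) does fit, so a shifted
  //    base is used; with e.g. BaseOff = 256 the new offsets become 44 and
  //    54, and BaseOff * EltSize is folded into the base address.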
bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
                                     const CombineInfo &CI,
                                     const CombineInfo &Paired) {
  const unsigned Width = (CI.Width + Paired.Width);
  switch (CI.InstClass) {
  case S_BUFFER_LOAD_IMM:
  case S_BUFFER_LOAD_SGPR_IMM:

SILoadStoreOptimizer::getDataRegClass(const MachineInstr &MI) const {
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
  if (const auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
    return TRI->getRegClassForReg(*MRI, Dst->getReg());
  }
  if (const auto *Src = TII->getNamedOperand(MI, AMDGPU::OpName::sdata)) {
    return TRI->getRegClassForReg(*MRI, Src->getReg());
  }
/// This function assumes that CI comes before Paired in a basic block. Return
/// an insertion point for the merged instruction or nullptr on failure.
SILoadStoreOptimizer::CombineInfo *
SILoadStoreOptimizer::checkAndPrepareMerge(CombineInfo &CI,
                                           CombineInfo &Paired) {
  // If another instruction has already been merged into CI, it may now be a
  // type that we can't do any further merging into.
  if (CI.InstClass == UNKNOWN || Paired.InstClass == UNKNOWN)

  assert(CI.InstClass == Paired.InstClass);

  if (getInstSubclass(CI.I->getOpcode(), *TII) !=
      getInstSubclass(Paired.I->getOpcode(), *TII))

  // Check both offsets (or masks for MIMG) can be combined and fit in the
  // new instruction.
  if (CI.InstClass == MIMG) {
    if (!dmasksCanBeCombined(CI, *TII, Paired))

  if (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired))

  if (CI.I->mayLoad()) {
    // Try to hoist Paired up to CI.
    if (!canSwapInstructions(RegDefs, RegUses, *Paired.I, *MBBI))

    // Try to sink CI down to Paired.
    if (!canSwapInstructions(RegDefs, RegUses, *CI.I, *MBBI))

  // Call offsetsCanBeCombined with modify = true so that the offsets are
  // correct for the new instruction. This should return true, because
  // this function should only be called on CombineInfo objects that
  // have already been confirmed to be mergeable.
  if (CI.InstClass == DS_READ || CI.InstClass == DS_WRITE)
    offsetsCanBeCombined(CI, *STM, Paired, true);
// Copy the merged load result from DestReg to the original dest regs of CI and
// Paired.
void SILoadStoreOptimizer::copyToDestRegs(
    CombineInfo &CI, CombineInfo &Paired,
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the old destination registers.
  auto *Dest0 = TII->getNamedOperand(*CI.I, OpName);
  auto *Dest1 = TII->getNamedOperand(*Paired.I, OpName);

  // The constrained sload instructions in S_LOAD_IMM class will have
  // `early-clobber` flag in the dst operand. Remove the flag before using the
  // dsts in the new merged instruction.
  Dest0->setIsEarlyClobber(false);
  Dest1->setIsEarlyClobber(false);

      .add(*Dest0) // Copy to same destination including flags and sub reg.
      .addReg(DestReg, 0, SubRegIdx0);
// Return a register for the source of the merged store after copying the
// original source regs of CI and Paired into it.
Register
SILoadStoreOptimizer::copyFromSrcRegs(CombineInfo &CI, CombineInfo &Paired,
  auto [SubRegIdx0, SubRegIdx1] = getSubRegIdxs(CI, Paired);

  // Copy to the new source register.
  Register SrcReg = MRI->createVirtualRegister(SuperRC);

  const auto *Src0 = TII->getNamedOperand(*CI.I, OpName);
  const auto *Src1 = TII->getNamedOperand(*Paired.I, OpName);

unsigned SILoadStoreOptimizer::read2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2_B32_gfx9 : AMDGPU::DS_READ2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32_gfx9
                        : AMDGPU::DS_READ2ST64_B64_gfx9;
}
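
// Worked example (added for illustration, mirroring the file header): merging
//   ds_read_b32 v0, v2 offset:16
//   ds_read_b32 v1, v2 offset:32
// uses EltSize = 4, so the ds_read2_b32 built below gets offset0 = 16/4 = 4
// and offset1 = 32/4 = 8. The ST64 opcodes returned above additionally scale
// the offsets by 64, which is why offsetsCanBeCombined() only selects them
// when both element offsets are multiples of 64.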
SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
  // Be careful, since the addresses could be subregisters themselves in weird
  // cases, like vectors of pointers.
  const auto *AddrReg = TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);

  unsigned NewOffset0 = std::min(CI.Offset, Paired.Offset);
  unsigned NewOffset1 = std::max(CI.Offset, Paired.Offset);
      CI.UseST64 ? read2ST64Opcode(CI.EltSize) : read2Opcode(CI.EltSize);

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  Register BaseReg = AddrReg->getReg();
  unsigned BaseSubReg = AddrReg->getSubReg();
  unsigned BaseRegFlags = 0;
    Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

    BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    TII->getAddNoCarry(*MBB, InsertBefore, DL, BaseReg)
        .addReg(AddrReg->getReg(), 0, BaseSubReg)
        .addImm(0); // clamp bit

          .addReg(BaseReg, BaseRegFlags, BaseSubReg) // addr
          .addImm(NewOffset0)                        // offset0
          .addImm(NewOffset1)                        // offset1

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);
1378 CI.I->eraseFromParent();
1379 Paired.I->eraseFromParent();
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2_B32_gfx9
                        : AMDGPU::DS_WRITE2_B64_gfx9;
}

unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
    return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32
                          : AMDGPU::DS_WRITE2ST64_B64;
  return (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32_gfx9
                        : AMDGPU::DS_WRITE2ST64_B64_gfx9;
}

                                  CombineInfo &CI, CombineInfo &Paired,
  // Be sure to use .addOperand(), and not .addReg() with these. We want to be
  // sure we preserve the subregister index and any register flags set on them.
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::addr);
      TII->getNamedOperand(*CI.I, AMDGPU::OpName::data0);
      TII->getNamedOperand(*Paired.I, AMDGPU::OpName::data0);

  unsigned NewOffset0 = CI.Offset;
  unsigned NewOffset1 = Paired.Offset;
      CI.UseST64 ? write2ST64Opcode(CI.EltSize) : write2Opcode(CI.EltSize);

  if (NewOffset0 > NewOffset1) {
    // Canonicalize the merged instruction so the smaller offset comes first.

  assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
         (NewOffset0 != NewOffset1) && "Computed offset doesn't fit");
getSubReg();
1434unsigned BaseRegFlags = 0;
1436Register ImmReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1440 BaseReg =
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
1443TII->getAddNoCarry(*
MBB, InsertBefore,
DL, BaseReg)
1445 .addReg(AddrReg->
getReg(), 0, BaseSubReg)
1446 .addImm(0);
// clamp bit 1452 .
addReg(BaseReg, BaseRegFlags, BaseSubReg)
// addr 1453 .
add(*Data0)
// data0 1454 .
add(*Data1)
// data1 1455 .
addImm(NewOffset0)
// offset0 1456 .
addImm(NewOffset1)
// offset1 1461 Paired.I->eraseFromParent();
SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedDMask = CI.DMask | Paired.DMask;

  for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
      MIB.addImm(MergedDMask);
      MIB.add((*CI.I).getOperand(I));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

                                 CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

          .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
  if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
    New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
  New.addImm(MergedOffset);
  New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::sdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
                                 CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset) // offset
      .addImm(CI.CPol)      // cpol
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

                                 CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  // Copy to the new source register.
  Register DestReg = MRI->createVirtualRegister(SuperRC);
  unsigned MergedOffset = std::min(CI.Offset, Paired.Offset);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(MergedOffset)  // offset
      .addImm(JoinedFormat)  // format
      .addImm(CI.CPol)       // cpol
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
                                  CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

  AddressRegs Regs = getRegs(Opcode, *TII);

    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));

  unsigned JoinedFormat =

  // It shouldn't be possible to get this far if the two instructions
  // don't have a single memoperand, because MachineInstr::mayAlias()
  // will return true if this is the case.
  assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
      .addImm(std::min(CI.Offset, Paired.Offset)) // offset
      .addImm(JoinedFormat)                       // format
      .addImm(CI.CPol)                            // cpol
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

                                CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

  Register DestReg = MRI->createVirtualRegister(SuperRC);

  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
      .addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  copyToDestRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdst, DestReg);

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();

                                 CombineInfo &CI, CombineInfo &Paired,
  const unsigned Opcode = getNewOpcode(CI, Paired);

      copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);

      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr))
  if (auto *SAddr = TII->getNamedOperand(*CI.I, AMDGPU::OpName::saddr))

  MIB.addImm(std::min(CI.Offset, Paired.Offset))
      .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));

  CI.I->eraseFromParent();
  Paired.I->eraseFromParent();
// Conservatively returns true if the MMO was not found.
         (MMOs.size() != 1 || MMOs[0]->getAlign().value() < Width * 4);
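
// Example (added for illustration; not part of the original comments): when
// merging two dword loads the combined Width is 2, so the single known MMO
// must be aligned to at least Width * 4 = 8 bytes; otherwise, with XNACK
// enabled, getNewOpcode() below selects the early-clobber-constrained "_ec"
// opcode variants.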
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
                                            const CombineInfo &Paired) {
  const unsigned Width = CI.Width + Paired.Width;

  switch (getCommonInstClass(CI, Paired)) {
    assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
    // FIXME: Handle d16 correctly
  case S_BUFFER_LOAD_IMM: {
    // If XNACK is enabled, use the constrained opcodes when the first load is
    // under-aligned.
    bool NeedsConstrainedOpc =
1745// If XNACK is enabled, use the constrained opcodes when the first load is 1747bool NeedsConstrainedOpc =
1753return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM_ec
1754 : AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
1756return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM_ec
1757 : AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
1759return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM_ec
1760 : AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
1762return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM_ec
1763 : AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM;
1766case S_BUFFER_LOAD_SGPR_IMM: {
1767// If XNACK is enabled, use the constrained opcodes when the first load is 1769bool NeedsConstrainedOpc =
1775return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM_ec
1776 : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
1778return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM_ec
1779 : AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
1781return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM_ec
1782 : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
1784return NeedsConstrainedOpc ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM_ec
1785 : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
1789// If XNACK is enabled, use the constrained opcodes when the first load is 1791bool NeedsConstrainedOpc =
1797return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1798 : AMDGPU::S_LOAD_DWORDX2_IMM;
1800return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1801 : AMDGPU::S_LOAD_DWORDX3_IMM;
1803return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1804 : AMDGPU::S_LOAD_DWORDX4_IMM;
1806return NeedsConstrainedOpc ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1807 : AMDGPU::S_LOAD_DWORDX8_IMM;
1815return AMDGPU::GLOBAL_LOAD_DWORDX2;
1817return AMDGPU::GLOBAL_LOAD_DWORDX3;
1819return AMDGPU::GLOBAL_LOAD_DWORDX4;
1821case GLOBAL_LOAD_SADDR:
1826return AMDGPU::GLOBAL_LOAD_DWORDX2_SADDR;
1828return AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR;
1830return AMDGPU::GLOBAL_LOAD_DWORDX4_SADDR;
1837return AMDGPU::GLOBAL_STORE_DWORDX2;
1839return AMDGPU::GLOBAL_STORE_DWORDX3;
1841return AMDGPU::GLOBAL_STORE_DWORDX4;
1843case GLOBAL_STORE_SADDR:
1848return AMDGPU::GLOBAL_STORE_DWORDX2_SADDR;
1850return AMDGPU::GLOBAL_STORE_DWORDX3_SADDR;
1852return AMDGPU::GLOBAL_STORE_DWORDX4_SADDR;
1859return AMDGPU::FLAT_LOAD_DWORDX2;
1861return AMDGPU::FLAT_LOAD_DWORDX3;
1863return AMDGPU::FLAT_LOAD_DWORDX4;
1870return AMDGPU::FLAT_STORE_DWORDX2;
1872return AMDGPU::FLAT_STORE_DWORDX3;
1874return AMDGPU::FLAT_STORE_DWORDX4;
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI,
                                    const CombineInfo &Paired) {
  assert((CI.InstClass != MIMG ||
                            CI.Width + Paired.Width)) &&

  static const unsigned Idxs[5][4] = {
      {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
      {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4},
      {AMDGPU::sub2, AMDGPU::sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5},
      {AMDGPU::sub3, AMDGPU::sub3_sub4, AMDGPU::sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6},
      {AMDGPU::sub4, AMDGPU::sub4_sub5, AMDGPU::sub4_sub5_sub6, AMDGPU::sub4_sub5_sub6_sub7},
  };

  assert(CI.Width >= 1 && CI.Width <= 4);
  assert(Paired.Width >= 1 && Paired.Width <= 4);

    Idx1 = Idxs[0][Paired.Width - 1];
    Idx0 = Idxs[Paired.Width][CI.Width - 1];
    Idx0 = Idxs[0][CI.Width - 1];
    Idx1 = Idxs[CI.Width][Paired.Width - 1];
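
  // Illustrative example (added for clarity; not from the original source):
  // if the instruction with the lower offset has Width = 2 and the other one
  // also has Width = 2, the table above yields Idx0 = sub0_sub1 for the first
  // and Idx1 = sub2_sub3 for the second, i.e. the two halves of the merged
  // 4-dword result. Which of the two assignment orders above is taken depends
  // on whether CI or Paired comes first in offset order.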
const TargetRegisterClass *
SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
                                             const CombineInfo &Paired) const {
  if (CI.InstClass == S_BUFFER_LOAD_IMM ||
      CI.InstClass == S_BUFFER_LOAD_SGPR_IMM || CI.InstClass == S_LOAD_IMM) {
    switch (CI.Width + Paired.Width) {
      return &AMDGPU::SReg_64_XEXECRegClass;
      return &AMDGPU::SGPR_96RegClass;
      return &AMDGPU::SGPR_128RegClass;
      return &AMDGPU::SGPR_256RegClass;
      return &AMDGPU::SGPR_512RegClass;

  unsigned BitWidth = 32 * (CI.Width + Paired.Width);
  return TRI->isAGPRClass(getDataRegClass(*CI.I))
1944 CombineInfo &CI, CombineInfo &Paired,
1949constunsigned Opcode = getNewOpcode(CI, Paired);
1952 copyFromSrcRegs(CI, Paired, InsertBefore, AMDGPU::OpName::vdata);
1957 AddressRegs Regs = getRegs(Opcode, *
TII);
1960 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
1963// It shouldn't be possible to get this far if the two instructions 1964// don't have a single memoperand, because MachineInstr::mayAlias() 1965// will return true if this is the case. 1966assert(CI.I->hasOneMemOperand() && Paired.I->hasOneMemOperand());
1969 MIB.add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
1970 .add(*
TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
1971 .addImm(std::min(CI.Offset, Paired.Offset))
// offset 1972 .addImm(CI.CPol)
// cpol 1974 .addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
1976 CI.I->eraseFromParent();
1977 Paired.I->eraseFromParent();
SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
  if (TII->isInlineConstant(V))

  Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
          TII->get(AMDGPU::S_MOV_B32), Reg)

// Compute base address using Addr and return the final register.
                                       const MemAddress &Addr) const {
                         Addr.Base.LoSubReg) &&
         "Expected 32-bit Base-Register-Low!!");

                         Addr.Base.HiSubReg) &&
         "Expected 32-bit Base-Register-Hi!!");

      createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);

  const auto *CarryRC = TRI->getWaveMaskRegClass();
  Register CarryReg = MRI->createVirtualRegister(CarryRC);
  Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);

  Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  Register FullDestReg = MRI->createVirtualRegister(TRI->getVGPR64Class());
// Update base and offset with the NewBase and NewOffset in MI.
                                               int32_t NewOffset) const {
  auto *Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
  Base->setReg(NewBase);
  Base->setIsKill(false);
  TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}

std::optional<int32_t>
  if (!Def || Def->getOpcode() != AMDGPU::S_MOV_B32 ||
      !Def->getOperand(1).isImm())

  return Def->getOperand(1).getImm();
// Analyze Base and extracts:
//  - 32bit base registers, subregisters
//  - 64bit constant offset
// Expecting base computation as:
//   %OFFSET0:sgpr_32 = S_MOV_B32 8000
//   %LO:vgpr_32, %c:sreg_64_xexec =
//       V_ADD_CO_U32_e64 %BASE_LO:vgpr_32, %103:sgpr_32,
//   %HI:vgpr_32, = V_ADDC_U32_e64 %BASE_HI:vgpr_32, 0, killed %c:sreg_64_xexec
//
//   REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
                                                      MemAddress &Addr) const {
  if (!Def || Def->getOpcode() != AMDGPU::REG_SEQUENCE ||
      Def->getNumOperands() != 5)

  if (!BaseLoDef || BaseLoDef->getOpcode() != AMDGPU::V_ADD_CO_U32_e64 ||
      !BaseHiDef || BaseHiDef->getOpcode() != AMDGPU::V_ADDC_U32_e64)

  const auto *Src0 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src0);
  const auto *Src1 = TII->getNamedOperand(*BaseLoDef, AMDGPU::OpName::src1);

  auto Offset0P = extractConstOffset(*Src0);
    if (!(Offset0P = extractConstOffset(*Src1)))

  Src0 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src0);
  Src1 = TII->getNamedOperand(*BaseHiDef, AMDGPU::OpName::src1);

  if (!Src1->isImm() || Src0->isImm())

  Addr.Offset = (*Offset0P & 0x00000000ffffffff) | (Offset1 << 32);
2150bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
2152 MemInfoMap &Visited,
2158// TODO: Support FLAT_SCRATCH. Currently code expects 64-bit pointers. 2170if (
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm()) {
2175// Step1: Find the base-registers and a 64bit constant offset. 2178if (!Visited.contains(&
MI)) {
2179 processBaseWithConstOffset(
Base, MAddr);
2180 Visited[&
MI] = MAddr;
2182 MAddr = Visited[&
MI];
2184if (MAddr.Offset == 0) {
2185LLVM_DEBUG(
dbgs() <<
" Failed to extract constant-offset or there are no" 2186" constant offsets that can be promoted.\n";);
2192 <<
"} Offset: " << MAddr.Offset <<
"\n\n";);
2194// Step2: Traverse through MI's basic block and find an anchor(that has the 2195// same base-registers) with the highest 13bit distance from MI's offset. 2196// E.g. (64bit loads) 2198// addr1 = &a + 4096; load1 = load(addr1, 0) 2199// addr2 = &a + 6144; load2 = load(addr2, 0) 2200// addr3 = &a + 8192; load3 = load(addr3, 0) 2201// addr4 = &a + 10240; load4 = load(addr4, 0) 2202// addr5 = &a + 12288; load5 = load(addr5, 0) 2204// Starting from the first load, the optimization will try to find a new base 2205// from which (&a + 4096) has 13 bit distance. Both &a + 6144 and &a + 8192 2206// has 13bit distance from &a + 4096. The heuristic considers &a + 8192 2207// as the new-base(anchor) because of the maximum distance which can 2208// accommodate more intermediate bases presumably. 2210// Step3: move (&a + 8192) above load1. Compute and promote offsets from 2211// (&a + 8192) for load1, load2, load4. 2213// load1 = load(addr, -4096) 2214// load2 = load(addr, -2048) 2215// load3 = load(addr, 0) 2216// load4 = load(addr, 2048) 2217// addr5 = &a + 12288; load5 = load(addr5, 0) 2220 MemAddress AnchorAddr;
2221uint32_t MaxDist = std::numeric_limits<uint32_t>::min();
2233// TODO: Support finding an anchor(with same base) from store addresses or 2234// any other load addresses where the opcodes are different. 2236TII->getNamedOperand(MINext, AMDGPU::OpName::offset)->getImm())
2240 *
TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
2241 MemAddress MAddrNext;
2242if (!Visited.contains(&MINext)) {
2243 processBaseWithConstOffset(BaseNext, MAddrNext);
2244 Visited[&MINext] = MAddrNext;
2246 MAddrNext = Visited[&MINext];
2248if (MAddrNext.Base.LoReg != MAddr.Base.LoReg ||
2249 MAddrNext.Base.HiReg != MAddr.Base.HiReg ||
2250 MAddrNext.Base.LoSubReg != MAddr.Base.LoSubReg ||
2251 MAddrNext.Base.HiSubReg != MAddr.Base.HiSubReg)
2254 InstsWCommonBase.
emplace_back(&MINext, MAddrNext.Offset);
2256 int64_t Dist = MAddr.Offset - MAddrNext.Offset;
2261 (
uint32_t)std::abs(Dist) > MaxDist) {
2262 MaxDist = std::abs(Dist);
2264 AnchorAddr = MAddrNext;
2265 AnchorInst = &MINext;
2270LLVM_DEBUG(
dbgs() <<
" Anchor-Inst(with max-distance from Offset): ";
2271 AnchorInst->
dump());
2273 << AnchorAddr.Offset <<
"\n\n");
2275// Instead of moving up, just re-compute anchor-instruction's base address. 2278 updateBaseAndOffset(
MI,
Base, MAddr.Offset - AnchorAddr.Offset);
2281for (
auto [OtherMI, OtherOffset] : InstsWCommonBase) {
2284 AM.
BaseOffs = OtherOffset - AnchorAddr.Offset;
2289 updateBaseAndOffset(*OtherMI,
Base, OtherOffset - AnchorAddr.Offset);
2300void SILoadStoreOptimizer::addInstToMergeableList(
const CombineInfo &CI,
2301 std::list<std::list<CombineInfo> > &MergeableInsts)
const{
2302for (std::list<CombineInfo> &AddrList : MergeableInsts) {
2303if (AddrList.front().InstClass == CI.InstClass &&
2304 AddrList.front().IsAGPR == CI.IsAGPR &&
2305 AddrList.front().hasSameBaseAddress(CI)) {
2306 AddrList.emplace_back(CI);
2311// Base address not found, so add a new list. 2312 MergeableInsts.emplace_back(1, CI);
2315std::pair<MachineBasicBlock::iterator, bool>
2316SILoadStoreOptimizer::collectMergeableInsts(
2319 std::list<std::list<CombineInfo>> &MergeableInsts)
const{
2322// Sort potential mergeable instructions into lists. One list per base address. 2325for (; BlockI !=
End; ++BlockI) {
2328// We run this before checking if an address is mergeable, because it can produce 2329// better code even if the instructions aren't mergeable. 2333// Treat volatile accesses, ordered accesses and unmodeled side effects as 2334// barriers. We can look after this barrier for separate merges. 2335if (
MI.hasOrderedMemoryRef() ||
MI.hasUnmodeledSideEffects()) {
2338// Search will resume after this instruction in a separate merge list. 2343const InstClassEnum InstClass = getInstClass(
MI.getOpcode(), *
TII);
2344if (InstClass == UNKNOWN)
2347// Do not merge VMEM buffer instructions with "swizzled" bit set. 2350if (Swizzled != -1 &&
MI.getOperand(Swizzled).getImm())
2354 CI.setMI(
MI, *
this);
2357if (!CI.hasMergeableAddress(*
MRI))
2360if (CI.InstClass == DS_WRITE && CI.IsAGPR) {
2361// FIXME: nothing is illegal in a ds_write2 opcode with two AGPR data 2362// operands. However we are reporting that ds_write2 shall have 2363// only VGPR data so that machine copy propagation does not 2364// create an illegal instruction with a VGPR and AGPR sources. 2365// Consequenctially if we create such instruction the verifier 2375// At this point we have lists of Mergeable instructions. 2377// Part 2: Sort lists by offset and then for each CombineInfo object in the 2378// list try to find an instruction that can be merged with I. If an instruction 2379// is found, it is stored in the Paired field. If no instructions are found, then 2380// the CombineInfo object is deleted from the list. 2382for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2383 E = MergeableInsts.end();
I != E;) {
2385 std::list<CombineInfo> &MergeList = *
I;
2386if (MergeList.size() <= 1) {
2387// This means we have found only one instruction with a given address 2388// that can be merged, and we need at least 2 instructions to do a merge, 2389// so this list can be discarded. 2390I = MergeableInsts.erase(
I);
2394// Sort the lists by offsets, this way mergeable instructions will be 2395// adjacent to each other in the list, which will make it easier to find 2398 [] (
const CombineInfo &
A,
const CombineInfo &
B) {
2399returnA.Offset <
B.Offset;
2407// Scan through looking for adjacent LDS operations with constant offsets from 2408// the same base register. We rely on the scheduler to do the hard work of 2409// clustering nearby loads, and assume these are all adjacent. 2410bool SILoadStoreOptimizer::optimizeBlock(
2411 std::list<std::list<CombineInfo> > &MergeableInsts) {
2414for (std::list<std::list<CombineInfo>>::iterator
I = MergeableInsts.begin(),
2415 E = MergeableInsts.end();
I != E;) {
2416 std::list<CombineInfo> &MergeList = *
I;
2418bool OptimizeListAgain =
false;
2419if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
2420// We weren't able to make any changes, so delete the list so we don't 2421// process the same instructions the next time we try to optimize this 2423I = MergeableInsts.erase(
I);
2429// We made changes, but also determined that there were no more optimization 2430// opportunities, so we don't need to reprocess the list 2431if (!OptimizeListAgain) {
2432I = MergeableInsts.erase(
I);
2435 OptimizeAgain =
true;
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
                                          std::list<CombineInfo> &MergeList,
                                          bool &OptimizeListAgain) {
  if (MergeList.empty())
    return false;

  bool Modified = false;

  for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
       Next = std::next(I)) {

    auto First = I;
    auto Second = Next;

    if ((*First).Order > (*Second).Order)
      std::swap(First, Second);
    CombineInfo &CI = *First;
    CombineInfo &Paired = *Second;

    CombineInfo *Where = checkAndPrepareMerge(CI, Paired);
    if (!Where) {
      ++I;
      continue;
    }

    Modified = true;

    LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);

    MachineBasicBlock::iterator NewMI;
    switch (CI.InstClass) {
    default:
      llvm_unreachable("unknown InstClass");
      break;
    case DS_READ:
      NewMI = mergeRead2Pair(CI, Paired, Where->I);
      break;
    case DS_WRITE:
      NewMI = mergeWrite2Pair(CI, Paired, Where->I);
      break;
    case S_BUFFER_LOAD_IMM:
    case S_BUFFER_LOAD_SGPR_IMM:
    case S_LOAD_IMM:
      NewMI = mergeSMemLoadImmPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 8;
      break;
    case BUFFER_LOAD:
      NewMI = mergeBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case BUFFER_STORE:
      NewMI = mergeBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case MIMG:
      NewMI = mergeImagePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_LOAD:
      NewMI = mergeTBufferLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case TBUFFER_STORE:
      NewMI = mergeTBufferStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_LOAD:
    case GLOBAL_LOAD:
    case GLOBAL_LOAD_SADDR:
      NewMI = mergeFlatLoadPair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    case FLAT_STORE:
    case GLOBAL_STORE:
    case GLOBAL_STORE_SADDR:
      NewMI = mergeFlatStorePair(CI, Paired, Where->I);
      OptimizeListAgain |= CI.Width + Paired.Width < 4;
      break;
    }
    CI.setMI(NewMI, *this);
    CI.Order = Where->Order;
    if (I == Second)
      I = Next;

    MergeList.erase(Second);
  }

  return Modified;
}
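// The "CI.Width + Paired.Width < N" checks above compare against the widest
// result each class can still grow into: VMEM, FLAT and image merges top out
// at four 32-bit components (e.g. dwordx4), while SMEM loads can be widened up
// to dwordx8. As an illustration, merging two dword buffer loads (1 + 1 = 2,
// which is < 4) keeps the list alive for another round, while merging two
// dwordx2 loads (2 + 2 = 4) does not request further reprocessing.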
bool SILoadStoreOptimizerLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
  return SILoadStoreOptimizer(
             &getAnalysis<AAResultsWrapperPass>().getAAResults())
      .run(MF);
}
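// Both pass-manager entry points funnel into the same SILoadStoreOptimizer
// implementation: the legacy wrapper above takes AAResults from
// AAResultsWrapperPass, while the new pass-manager run() below obtains it via
// the FunctionAnalysisManager proxy.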
bool SILoadStoreOptimizer::run(MachineFunction &MF) {
  STM = &MF.getSubtarget<GCNSubtarget>();
  if (!STM->loadStoreOptEnabled())
    return false;

  TII = STM->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();

  bool Modified = false;

  // Contains the list of instructions for which constant offsets are being
  // promoted to the IMM. This is tracked for an entire block at a time.
  MemInfoMap Visited;
  SmallPtrSet<MachineInstr *, 4> AnchorList;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator SectionEnd;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
         I = SectionEnd) {
      bool CollectModified;
      std::list<std::list<CombineInfo>> MergeableInsts;

      // First pass: Collect list of all instructions we know how to merge in a
      // subset of the block.
      std::tie(SectionEnd, CollectModified) =
          collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
      Modified |= CollectModified;

      do {
        OptimizeAgain = false;
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }

    Visited.clear();
    AnchorList.clear();
  }

  return Modified;
}

PreservedAnalyses
SILoadStoreOptimizerPass::run(MachineFunction &MF,
                              MachineFunctionAnalysisManager &MFAM) {
  if (MF.getFunction().hasOptNone())
    return PreservedAnalyses::all();

  auto &FAM = MFAM.getResult<FunctionAnalysisManagerMachineFunctionProxy>(MF)
                  .getManager();
  AAResults &AA = FAM.getResult<AAManager>(MF.getFunction());

  bool Changed = SILoadStoreOptimizer(&AA).run(MF);
  if (!Changed)
    return PreservedAnalyses::all();

  PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
  PA.preserveSet<CFGAnalyses>();
  return PA;
}
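// Note: the pass rewrites, widens and deletes memory instructions but never
// alters the control flow, which is why the new pass-manager entry point
// preserves the CFG analyses set.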