LLVM 20.0.0git
AMDGPUInstructionSelector.cpp
//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AMDGPUInstructionSelector.h"
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include <optional>

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
#undef AMDGPUSubtarget

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
      STI(STI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }

void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
                                        CodeGenCoverage *CoverageInfo,
                                        ProfileSummaryInfo *PSI,
                                        BlockFrequencyInfo *BFI) {
  MRI = &MF.getRegInfo();
  Subtarget = &MF.getSubtarget<GCNSubtarget>();
  Subtarget->checkSubtargetFeatures(MF.getFunction());
  InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
68
// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}

bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.
  if (Reg.isPhysical())
    return false;

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    if (!Ty.isValid() || Ty.getSizeInBits() != 1)
      return false;
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}
97
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.
  MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  MachineOperand &Dst = MI.getOperand(0);
  MachineOperand &Src = MI.getOperand(1);

  // TODO: This should be legalized to s32 if needed
  if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
    return false;

  const TargetRegisterClass *DstRC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  const TargetRegisterClass *SrcRC
    = TRI.getConstrainedRegClassForOperand(Src, *MRI);
  if (!DstRC || DstRC != SrcRC)
    return false;

  return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
         RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
}
121
122bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const{
123constDebugLoc &DL =I.getDebugLoc();
124MachineBasicBlock *BB =I.getParent();
125I.setDesc(TII.get(TargetOpcode::COPY));
126
127constMachineOperand &Src =I.getOperand(1);
128MachineOperand &Dst =I.getOperand(0);
129Register DstReg = Dst.getReg();
130Register SrcReg = Src.getReg();
131
132if (isVCC(DstReg, *MRI)) {
133if (SrcReg == AMDGPU::SCC) {
134constTargetRegisterClass *RC
135 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
136if (!RC)
137returntrue;
138return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
139 }
140
141if (!isVCC(SrcReg, *MRI)) {
142// TODO: Should probably leave the copy and let copyPhysReg expand it.
143if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
144returnfalse;
145
146constTargetRegisterClass *SrcRC
147 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
148
149 std::optional<ValueAndVReg> ConstVal =
150getIConstantVRegValWithLookThrough(SrcReg, *MRI,true);
151if (ConstVal) {
152unsigned MovOpc =
153 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
154BuildMI(*BB, &I,DL, TII.get(MovOpc), DstReg)
155 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
156 }else {
157Register MaskedReg =MRI->createVirtualRegister(SrcRC);
158
159// We can't trust the high bits at this point, so clear them.
160
161// TODO: Skip masking high bits if def is known boolean.
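      // Added illustrative note (hypothetical vregs): for a 32-bit source this
      // emits, roughly,
      //   %masked = V_AND_B32_e32 1, %src      (S_AND_B32 for SGPR sources)
      //   %dst    = V_CMP_NE_U32_e64 0, %masked
      // so a possibly-dirty boolean becomes a proper lane mask.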
162
163if (AMDGPU::getRegBitWidth(SrcRC->getID()) == 16) {
164assert(Subtarget->useRealTrue16Insts());
165const int64_t NoMods = 0;
166BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)
167 .addImm(NoMods)
168 .addImm(1)
169 .addImm(NoMods)
170 .addReg(SrcReg)
171 .addImm(NoMods);
172BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)
173 .addImm(NoMods)
174 .addImm(0)
175 .addImm(NoMods)
176 .addReg(MaskedReg)
177 .addImm(NoMods);
178 }else {
179bool IsSGPR = TRI.isSGPRClass(SrcRC);
180unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
181autoAnd =BuildMI(*BB, &I,DL, TII.get(AndOpc), MaskedReg)
182 .addImm(1)
183 .addReg(SrcReg);
184if (IsSGPR)
185And.setOperandDead(3);// Dead scc
186
187BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
188 .addImm(0)
189 .addReg(MaskedReg);
190 }
191 }
192
193if (!MRI->getRegClassOrNull(SrcReg))
194MRI->setRegClass(SrcReg, SrcRC);
195I.eraseFromParent();
196returntrue;
197 }
198
199constTargetRegisterClass *RC =
200 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
201if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
202returnfalse;
203
204returntrue;
205 }
206
207for (constMachineOperand &MO :I.operands()) {
208if (MO.getReg().isPhysical())
209continue;
210
211constTargetRegisterClass *RC =
212 TRI.getConstrainedRegClassForOperand(MO, *MRI);
213if (!RC)
214continue;
215 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
216 }
217returntrue;
218}
219
220bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const{
221constDebugLoc &DL =I.getDebugLoc();
222MachineBasicBlock *BB =I.getParent();
223
224unsigned CmpOpc =
225 STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
226MachineInstr *Cmp =BuildMI(*BB, &I,DL, TII.get(CmpOpc))
227 .addReg(I.getOperand(1).getReg())
228 .addImm(0);
229if (!constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI))
230returnfalse;
231
232Register DstReg =I.getOperand(0).getReg();
233BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), DstReg).addReg(AMDGPU::SCC);
234
235I.eraseFromParent();
236return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
237}
238
239bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const{
240constDebugLoc &DL =I.getDebugLoc();
241MachineBasicBlock *BB =I.getParent();
242
243Register DstReg =I.getOperand(0).getReg();
244Register SrcReg =I.getOperand(1).getReg();
245 std::optional<ValueAndVReg> Arg =
246getIConstantVRegValWithLookThrough(I.getOperand(1).getReg(), *MRI);
247
248if (Arg) {
249const int64_tValue = Arg->Value.getZExtValue();
250if (Value == 0) {
251unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
252BuildMI(*BB, &I,DL, TII.get(Opcode), DstReg).addImm(0);
253 }else {
254assert(Value == 1);
255BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), DstReg).addReg(TRI.getExec());
256 }
257I.eraseFromParent();
258return RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI);
259 }
260
261// RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
262BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), AMDGPU::SCC).addReg(SrcReg);
263
264unsigned SelectOpcode =
265 STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
266MachineInstr *Select =BuildMI(*BB, &I,DL, TII.get(SelectOpcode), DstReg)
267 .addReg(TRI.getExec())
268 .addImm(0);
269
270I.eraseFromParent();
271returnconstrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
272}
273
274bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const{
275Register DstReg =I.getOperand(0).getReg();
276Register SrcReg =I.getOperand(1).getReg();
277
278constDebugLoc &DL =I.getDebugLoc();
279MachineBasicBlock *BB =I.getParent();
280
281auto RFL =BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
282 .addReg(SrcReg);
283
284I.eraseFromParent();
285returnconstrainSelectedInstRegOperands(*RFL, TII, TRI, RBI);
286}
287
288bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const{
289constRegister DefReg =I.getOperand(0).getReg();
290constLLT DefTy =MRI->getType(DefReg);
291
  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
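  // For example (added note, hypothetical MIR), a divergent
  //   %phi:vcc(s1) = G_PHI %a(s1), %bb.1, %b(s1), %bb.2
  // is rejected here and is instead handled by the lane-mask merging lowering
  // mentioned above.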
296if (DefTy ==LLT::scalar(1))
297returnfalse;
298
299// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
300
301constRegClassOrRegBank &RegClassOrBank =
302MRI->getRegClassOrRegBank(DefReg);
303
304constTargetRegisterClass *DefRC =
305 dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
306if (!DefRC) {
307if (!DefTy.isValid()) {
308LLVM_DEBUG(dbgs() <<"PHI operand has no type, not a gvreg?\n");
309returnfalse;
310 }
311
312constRegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
313 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
314if (!DefRC) {
315LLVM_DEBUG(dbgs() <<"PHI operand has unexpected size/bank\n");
316returnfalse;
317 }
318 }
319
320// If inputs have register bank, assign corresponding reg class.
321// Note: registers don't need to have the same reg bank.
322for (unsigned i = 1; i !=I.getNumOperands(); i += 2) {
323constRegister SrcReg =I.getOperand(i).getReg();
324
325constRegisterBank *RB =MRI->getRegBankOrNull(SrcReg);
326if (RB) {
327constLLT SrcTy =MRI->getType(SrcReg);
328constTargetRegisterClass *SrcRC =
329 TRI.getRegClassForTypeOnBank(SrcTy, *RB);
330if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
331returnfalse;
332 }
333 }
334
335I.setDesc(TII.get(TargetOpcode::PHI));
336return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
337}
338
339MachineOperand
340AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
341constTargetRegisterClass &SubRC,
342unsigned SubIdx) const{
343
344MachineInstr *MI = MO.getParent();
345MachineBasicBlock *BB = MO.getParent()->getParent();
346Register DstReg =MRI->createVirtualRegister(&SubRC);
347
348if (MO.isReg()) {
349unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
350RegisterReg = MO.getReg();
351BuildMI(*BB,MI,MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
352 .addReg(Reg, 0, ComposedSubIdx);
353
354returnMachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
355 MO.isKill(), MO.isDead(), MO.isUndef(),
356 MO.isEarlyClobber(), 0, MO.isDebug(),
357 MO.isInternalRead());
358 }
359
360assert(MO.isImm());
361
362APIntImm(64, MO.getImm());
363
364switch (SubIdx) {
365default:
366llvm_unreachable("do not know to split immediate with this sub index.");
367case AMDGPU::sub0:
368returnMachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
369case AMDGPU::sub1:
370returnMachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
371 }
372}
373
374staticunsignedgetLogicalBitOpcode(unsigned Opc,bool Is64) {
375switch (Opc) {
376case AMDGPU::G_AND:
377return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
378case AMDGPU::G_OR:
379return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
380case AMDGPU::G_XOR:
381return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
382default:
383llvm_unreachable("not a bit op");
384 }
385}
386
387bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const{
388Register DstReg =I.getOperand(0).getReg();
389unsignedSize = RBI.getSizeInBits(DstReg, *MRI, TRI);
390
391constRegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
392if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
393 DstRB->getID() != AMDGPU::VCCRegBankID)
394returnfalse;
395
396bool Is64 =Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
397 STI.isWave64());
398I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
399
400// Dead implicit-def of scc
401I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC,true,// isDef
402true,// isImp
403false,// isKill
404true));// isDead
405returnconstrainSelectedInstRegOperands(I, TII, TRI, RBI);
406}
407
408bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const{
409MachineBasicBlock *BB =I.getParent();
410MachineFunction *MF = BB->getParent();
411Register DstReg =I.getOperand(0).getReg();
412constDebugLoc &DL =I.getDebugLoc();
413LLT Ty =MRI->getType(DstReg);
414if (Ty.isVector())
415returnfalse;
416
417unsignedSize = Ty.getSizeInBits();
418constRegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
419constbool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
420constbool Sub =I.getOpcode() == TargetOpcode::G_SUB;
421
422if (Size == 32) {
423if (IsSALU) {
424constunsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
425MachineInstr *Add =
426BuildMI(*BB, &I,DL, TII.get(Opc), DstReg)
427 .add(I.getOperand(1))
428 .add(I.getOperand(2))
429 .setOperandDead(3);// Dead scc
430I.eraseFromParent();
431returnconstrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
432 }
433
434if (STI.hasAddNoCarry()) {
435constunsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
436I.setDesc(TII.get(Opc));
437I.addOperand(*MF,MachineOperand::CreateImm(0));
438I.addOperand(*MF,MachineOperand::CreateReg(AMDGPU::EXEC,false,true));
439returnconstrainSelectedInstRegOperands(I, TII, TRI, RBI);
440 }
441
442constunsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
443
444Register UnusedCarry =MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
445MachineInstr *Add
446 =BuildMI(*BB, &I,DL, TII.get(Opc), DstReg)
447 .addDef(UnusedCarry,RegState::Dead)
448 .add(I.getOperand(1))
449 .add(I.getOperand(2))
450 .addImm(0);
451I.eraseFromParent();
452returnconstrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
453 }
454
455assert(!Sub &&"illegal sub should not reach here");
456
457constTargetRegisterClass &RC
458 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
459constTargetRegisterClass &HalfRC
460 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
461
462MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
463MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
464MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
465MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
466
467Register DstLo =MRI->createVirtualRegister(&HalfRC);
468Register DstHi =MRI->createVirtualRegister(&HalfRC);
469
470if (IsSALU) {
471BuildMI(*BB, &I,DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
472 .add(Lo1)
473 .add(Lo2);
474BuildMI(*BB, &I,DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
475 .add(Hi1)
476 .add(Hi2)
477 .setOperandDead(3);// Dead scc
478 }else {
479constTargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
480Register CarryReg =MRI->createVirtualRegister(CarryRC);
481BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
482 .addDef(CarryReg)
483 .add(Lo1)
484 .add(Lo2)
485 .addImm(0);
486MachineInstr *Addc =BuildMI(*BB, &I,DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
487 .addDef(MRI->createVirtualRegister(CarryRC),RegState::Dead)
488 .add(Hi1)
489 .add(Hi2)
490 .addReg(CarryReg,RegState::Kill)
491 .addImm(0);
492
493if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
494returnfalse;
495 }
496
497BuildMI(*BB, &I,DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
498 .addReg(DstLo)
499 .addImm(AMDGPU::sub0)
500 .addReg(DstHi)
501 .addImm(AMDGPU::sub1);
502
503
504if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
505returnfalse;
506
507I.eraseFromParent();
508returntrue;
509}
510
511bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
512MachineInstr &I) const{
513MachineBasicBlock *BB =I.getParent();
514MachineFunction *MF = BB->getParent();
515constDebugLoc &DL =I.getDebugLoc();
516Register Dst0Reg =I.getOperand(0).getReg();
517Register Dst1Reg =I.getOperand(1).getReg();
518constbool IsAdd =I.getOpcode() == AMDGPU::G_UADDO ||
519I.getOpcode() == AMDGPU::G_UADDE;
520constbool HasCarryIn =I.getOpcode() == AMDGPU::G_UADDE ||
521I.getOpcode() == AMDGPU::G_USUBE;
522
523if (isVCC(Dst1Reg, *MRI)) {
524unsigned NoCarryOpc =
525 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
526unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
527I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
528I.addOperand(*MF,MachineOperand::CreateReg(AMDGPU::EXEC,false,true));
529I.addOperand(*MF,MachineOperand::CreateImm(0));
530returnconstrainSelectedInstRegOperands(I, TII, TRI, RBI);
531 }
532
533Register Src0Reg =I.getOperand(2).getReg();
534Register Src1Reg =I.getOperand(3).getReg();
535
536if (HasCarryIn) {
537BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
538 .addReg(I.getOperand(4).getReg());
539 }
540
541unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
542unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
543
544auto CarryInst =BuildMI(*BB, &I,DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
545 .add(I.getOperand(2))
546 .add(I.getOperand(3));
547
548if (MRI->use_nodbg_empty(Dst1Reg)) {
549 CarryInst.setOperandDead(3);// Dead scc
550 }else {
551BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), Dst1Reg)
552 .addReg(AMDGPU::SCC);
553if (!MRI->getRegClassOrNull(Dst1Reg))
554MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
555 }
556
557if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
558 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
559 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
560returnfalse;
561
562if (HasCarryIn &&
563 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
564 AMDGPU::SReg_32RegClass, *MRI))
565returnfalse;
566
567I.eraseFromParent();
568returntrue;
569}
570
571bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
572MachineInstr &I) const{
573MachineBasicBlock *BB =I.getParent();
574MachineFunction *MF = BB->getParent();
575constbool IsUnsigned =I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
576
577unsigned Opc;
578if (Subtarget->hasMADIntraFwdBug())
579 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
580 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
581else
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
583I.setDesc(TII.get(Opc));
584I.addOperand(*MF,MachineOperand::CreateImm(0));
585I.addImplicitDefUseOperands(*MF);
586returnconstrainSelectedInstRegOperands(I, TII, TRI, RBI);
587}
588
589// TODO: We should probably legalize these to only using 32-bit results.
590bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const{
591MachineBasicBlock *BB =I.getParent();
592Register DstReg =I.getOperand(0).getReg();
593Register SrcReg =I.getOperand(1).getReg();
594LLT DstTy =MRI->getType(DstReg);
595LLT SrcTy =MRI->getType(SrcReg);
596constunsigned SrcSize = SrcTy.getSizeInBits();
597unsigned DstSize = DstTy.getSizeInBits();
598
599// TODO: Should handle any multiple of 32 offset.
600unsignedOffset =I.getOperand(2).getImm();
601if (Offset % 32 != 0 || DstSize > 128)
602returnfalse;
603
604// 16-bit operations really use 32-bit registers.
605// FIXME: Probably should not allow 16-bit G_EXTRACT results.
606if (DstSize == 16)
607 DstSize = 32;
608
609constTargetRegisterClass *DstRC =
610 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
611if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
612returnfalse;
613
614constRegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
615constTargetRegisterClass *SrcRC =
616 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
617if (!SrcRC)
618returnfalse;
619unsignedSubReg =SIRegisterInfo::getSubRegFromChannel(Offset / 32,
620 DstSize / 32);
621 SrcRC = TRI.getSubClassWithSubReg(SrcRC,SubReg);
622if (!SrcRC)
623returnfalse;
624
625 SrcReg =constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI,I,
626 *SrcRC,I.getOperand(1));
627constDebugLoc &DL =I.getDebugLoc();
628BuildMI(*BB, &I,DL, TII.get(TargetOpcode::COPY), DstReg)
629 .addReg(SrcReg, 0,SubReg);
630
631I.eraseFromParent();
632returntrue;
633}
634
635bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const{
636MachineBasicBlock *BB =MI.getParent();
637Register DstReg =MI.getOperand(0).getReg();
638LLT DstTy =MRI->getType(DstReg);
639LLT SrcTy =MRI->getType(MI.getOperand(1).getReg());
640
641constunsigned SrcSize = SrcTy.getSizeInBits();
642if (SrcSize < 32)
643return selectImpl(MI, *CoverageInfo);
644
645constDebugLoc &DL =MI.getDebugLoc();
646constRegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
647constunsigned DstSize = DstTy.getSizeInBits();
648constTargetRegisterClass *DstRC =
649 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
650if (!DstRC)
651returnfalse;
652
653ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
654MachineInstrBuilder MIB =
655BuildMI(*BB, &MI,DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
656for (intI = 0, E =MI.getNumOperands() - 1;I != E; ++I) {
657MachineOperand &Src =MI.getOperand(I + 1);
658 MIB.addReg(Src.getReg(),getUndefRegState(Src.isUndef()));
659 MIB.addImm(SubRegs[I]);
660
661constTargetRegisterClass *SrcRC
662 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
663if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
664returnfalse;
665 }
666
667if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
668returnfalse;
669
670MI.eraseFromParent();
671returntrue;
672}
673
674bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const{
675MachineBasicBlock *BB =MI.getParent();
676constint NumDst =MI.getNumOperands() - 1;
677
678MachineOperand &Src =MI.getOperand(NumDst);
679
680Register SrcReg = Src.getReg();
681Register DstReg0 =MI.getOperand(0).getReg();
682LLT DstTy =MRI->getType(DstReg0);
683LLT SrcTy =MRI->getType(SrcReg);
684
685constunsigned DstSize = DstTy.getSizeInBits();
686constunsigned SrcSize = SrcTy.getSizeInBits();
687constDebugLoc &DL =MI.getDebugLoc();
688constRegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
689
690constTargetRegisterClass *SrcRC =
691 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
692if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
693returnfalse;
694
  // Note we could have mixed SGPR and VGPR destination banks for an SGPR
  // source, and this relies on the fact that the same subregister indices are
  // used for both.
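  // Illustrative example (added note, hypothetical MIR): unmerging an SGPR
  // source into VGPR-bank results,
  //   %d0:vgpr(s32), %d1:vgpr(s32) = G_UNMERGE_VALUES %src:sgpr(s64)
  // becomes two subregister copies,
  //   %d0 = COPY %src.sub0
  //   %d1 = COPY %src.sub1
  // where sub0/sub1 are valid indices for both the SGPR and VGPR classes.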
698ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
699for (intI = 0, E = NumDst;I != E; ++I) {
700MachineOperand &Dst =MI.getOperand(I);
701BuildMI(*BB, &MI,DL, TII.get(TargetOpcode::COPY), Dst.getReg())
702 .addReg(SrcReg, 0, SubRegs[I]);
703
704// Make sure the subregister index is valid for the source register.
705 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
706if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
707returnfalse;
708
709constTargetRegisterClass *DstRC =
710 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
711if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
712returnfalse;
713 }
714
715MI.eraseFromParent();
716returntrue;
717}
718
719bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const{
720assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
721MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
722
723Register Src0 =MI.getOperand(1).getReg();
724Register Src1 =MI.getOperand(2).getReg();
725LLT SrcTy =MRI->getType(Src0);
726constunsigned SrcSize = SrcTy.getSizeInBits();
727
728// BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
729if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
730return selectG_MERGE_VALUES(MI);
731 }
732
733// Selection logic below is for V2S16 only.
734// For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
735Register Dst =MI.getOperand(0).getReg();
736if (MRI->getType(Dst) !=LLT::fixed_vector(2, 16) ||
737 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
738 SrcTy !=LLT::scalar(32)))
739return selectImpl(MI, *CoverageInfo);
740
741constRegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
742if (DstBank->getID() == AMDGPU::AGPRRegBankID)
743returnfalse;
744
745assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
746 DstBank->getID() == AMDGPU::VGPRRegBankID);
747constbool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
748
749constDebugLoc &DL =MI.getDebugLoc();
750MachineBasicBlock *BB =MI.getParent();
751
752// First, before trying TableGen patterns, check if both sources are
753// constants. In those cases, we can trivially compute the final constant
754// and emit a simple move.
755auto ConstSrc1 =getAnyConstantVRegValWithLookThrough(Src1, *MRI,true,true);
756if (ConstSrc1) {
757auto ConstSrc0 =
758getAnyConstantVRegValWithLookThrough(Src0, *MRI,true,true);
759if (ConstSrc0) {
760const int64_t K0 = ConstSrc0->Value.getSExtValue();
761const int64_t K1 = ConstSrc1->Value.getSExtValue();
762uint32_t Lo16 =static_cast<uint32_t>(K0) & 0xffff;
763uint32_t Hi16 =static_cast<uint32_t>(K1) & 0xffff;
764uint32_tImm = Lo16 | (Hi16 << 16);
765
766// VALU
767if (IsVector) {
768BuildMI(*BB, &MI,DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
769MI.eraseFromParent();
770return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
771 }
772
773// SALU
774BuildMI(*BB, &MI,DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
775MI.eraseFromParent();
776return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
777 }
778 }
779
780// Now try TableGen patterns.
781if (selectImpl(MI, *CoverageInfo))
782returntrue;
783
784// TODO: This should probably be a combine somewhere
785// (build_vector $src0, undef) -> copy $src0
786MachineInstr *Src1Def =getDefIgnoringCopies(Src1, *MRI);
787if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
788MI.setDesc(TII.get(AMDGPU::COPY));
789MI.removeOperand(2);
790constauto &RC =
791 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
792return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
793 RBI.constrainGenericRegister(Src0, RC, *MRI);
794 }
795
796// TODO: Can be improved?
797if (IsVector) {
798Register TmpReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
799auto MIB =BuildMI(*BB,MI,DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
800 .addImm(0xFFFF)
801 .addReg(Src0);
802if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
803returnfalse;
804
805 MIB =BuildMI(*BB,MI,DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
806 .addReg(Src1)
807 .addImm(16)
808 .addReg(TmpReg);
809if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
810returnfalse;
811
812MI.eraseFromParent();
813returntrue;
814 }
815
816Register ShiftSrc0;
817Register ShiftSrc1;
818
  // With multiple uses of the shift, this will duplicate the shift and
  // increase register pressure.
  //
  // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
  //  => (S_PACK_HH_B32_B16 $src0, $src1)
  // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
  //  => (S_PACK_HL_B32_B16 $src0, $src1)
  // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
  //  => (S_PACK_LH_B32_B16 $src0, $src1)
  // (build_vector $src0, $src1)
  //  => (S_PACK_LL_B32_B16 $src0, $src1)
830
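  // Illustrative example (added note, hypothetical MIR): with
  //   %h:sgpr(s32) = G_LSHR %a, 16          ; single use
  //   %v:sgpr(<2 x s16>) = G_BUILD_VECTOR_TRUNC %h, %b
  // only Shift0 matches, so on subtargets with S_PACK_HL this becomes
  //   %v = S_PACK_HL_B32_B16 %a, %b
  // and otherwise it stays S_PACK_LL_B32_B16 %h, %b.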
831bool Shift0 =mi_match(
832 Src0, *MRI,m_OneUse(m_GLShr(m_Reg(ShiftSrc0),m_SpecificICst(16))));
833
834bool Shift1 =mi_match(
835 Src1, *MRI,m_OneUse(m_GLShr(m_Reg(ShiftSrc1),m_SpecificICst(16))));
836
837unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
838if (Shift0 && Shift1) {
839 Opc = AMDGPU::S_PACK_HH_B32_B16;
840MI.getOperand(1).setReg(ShiftSrc0);
841MI.getOperand(2).setReg(ShiftSrc1);
842 }elseif (Shift1) {
843 Opc = AMDGPU::S_PACK_LH_B32_B16;
844MI.getOperand(2).setReg(ShiftSrc1);
845 }elseif (Shift0) {
846auto ConstSrc1 =
847getAnyConstantVRegValWithLookThrough(Src1, *MRI,true,true);
848if (ConstSrc1 && ConstSrc1->Value == 0) {
849// build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
850auto MIB =BuildMI(*BB, &MI,DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
851 .addReg(ShiftSrc0)
852 .addImm(16)
853 .setOperandDead(3);// Dead scc
854
855MI.eraseFromParent();
856returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
857 }
858if (STI.hasSPackHL()) {
859 Opc = AMDGPU::S_PACK_HL_B32_B16;
860MI.getOperand(1).setReg(ShiftSrc0);
861 }
862 }
863
864MI.setDesc(TII.get(Opc));
865returnconstrainSelectedInstRegOperands(MI, TII, TRI, RBI);
866}
867
868bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const{
869constMachineOperand &MO =I.getOperand(0);
870
871// FIXME: Interface for getConstrainedRegClassForOperand needs work. The
872// regbank check here is to know why getConstrainedRegClassForOperand failed.
873constTargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
874if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
875 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
876I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
877returntrue;
878 }
879
880returnfalse;
881}
882
883bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const{
884MachineBasicBlock *BB =I.getParent();
885
886Register DstReg =I.getOperand(0).getReg();
887Register Src0Reg =I.getOperand(1).getReg();
888Register Src1Reg =I.getOperand(2).getReg();
889LLT Src1Ty =MRI->getType(Src1Reg);
890
891unsigned DstSize =MRI->getType(DstReg).getSizeInBits();
892unsigned InsSize = Src1Ty.getSizeInBits();
893
894 int64_tOffset =I.getOperand(3).getImm();
895
896// FIXME: These cases should have been illegal and unnecessary to check here.
897if (Offset % 32 != 0 || InsSize % 32 != 0)
898returnfalse;
899
900// Currently not handled by getSubRegFromChannel.
901if (InsSize > 128)
902returnfalse;
903
904unsignedSubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
905if (SubReg == AMDGPU::NoSubRegister)
906returnfalse;
907
908constRegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
909constTargetRegisterClass *DstRC =
910 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
911if (!DstRC)
912returnfalse;
913
914constRegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
915constRegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
916constTargetRegisterClass *Src0RC =
917 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
918constTargetRegisterClass *Src1RC =
919 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
920
921// Deal with weird cases where the class only partially supports the subreg
922// index.
923 Src0RC = TRI.getSubClassWithSubReg(Src0RC,SubReg);
924if (!Src0RC || !Src1RC)
925returnfalse;
926
927if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
928 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
929 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
930returnfalse;
931
932constDebugLoc &DL =I.getDebugLoc();
933BuildMI(*BB, &I,DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
934 .addReg(Src0Reg)
935 .addReg(Src1Reg)
936 .addImm(SubReg);
937
938I.eraseFromParent();
939returntrue;
940}
941
942bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const{
943Register DstReg =MI.getOperand(0).getReg();
944Register SrcReg =MI.getOperand(1).getReg();
945Register OffsetReg =MI.getOperand(2).getReg();
946Register WidthReg =MI.getOperand(3).getReg();
947
948assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
949"scalar BFX instructions are expanded in regbankselect");
950assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
951"64-bit vector BFX instructions are expanded in regbankselect");
952
953constDebugLoc &DL =MI.getDebugLoc();
954MachineBasicBlock *MBB =MI.getParent();
955
956bool IsSigned =MI.getOpcode() == TargetOpcode::G_SBFX;
957unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
958auto MIB =BuildMI(*MBB, &MI,DL, TII.get(Opc), DstReg)
959 .addReg(SrcReg)
960 .addReg(OffsetReg)
961 .addReg(WidthReg);
962MI.eraseFromParent();
963returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
964}
965
966bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const{
967if (STI.getLDSBankCount() != 16)
968return selectImpl(MI, *CoverageInfo);
969
970Register Dst =MI.getOperand(0).getReg();
971Register Src0 =MI.getOperand(2).getReg();
972Register M0Val =MI.getOperand(6).getReg();
973if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
974 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
975 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
976returnfalse;
977
978// This requires 2 instructions. It is possible to write a pattern to support
979// this, but the generated isel emitter doesn't correctly deal with multiple
980// output instructions using the same physical register input. The copy to m0
981// is incorrectly placed before the second instruction.
982//
983// TODO: Match source modifiers.
984
985Register InterpMov =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
986constDebugLoc &DL =MI.getDebugLoc();
987MachineBasicBlock *MBB =MI.getParent();
988
989BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
990 .addReg(M0Val);
991BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
992 .addImm(2)
993 .addImm(MI.getOperand(4).getImm())// $attr
994 .addImm(MI.getOperand(3).getImm());// $attrchan
995
996BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
997 .addImm(0)// $src0_modifiers
998 .addReg(Src0)// $src0
999 .addImm(MI.getOperand(4).getImm())// $attr
1000 .addImm(MI.getOperand(3).getImm())// $attrchan
1001 .addImm(0)// $src2_modifiers
1002 .addReg(InterpMov)// $src2 - 2 f16 values selected by high
1003 .addImm(MI.getOperand(5).getImm())// $high
1004 .addImm(0)// $clamp
1005 .addImm(0);// $omod
1006
1007MI.eraseFromParent();
1008returntrue;
1009}
1010
// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPRs.
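// Illustrative sketch (added note, hypothetical vregs): with a non-constant
// SGPR value and a non-constant SGPR lane selector, the selection below emits
//   $m0   = COPY %lane_sgpr
//   %vdst = V_WRITELANE_B32 %val_sgpr, $m0, %vdst_in
// so the lane selector is read through M0 instead of a second unique SGPR.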
1016bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const{
1017// With a constant bus limit of at least 2, there's no issue.
1018if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
1019return selectImpl(MI, *CoverageInfo);
1020
1021MachineBasicBlock *MBB =MI.getParent();
1022constDebugLoc &DL =MI.getDebugLoc();
1023Register VDst =MI.getOperand(0).getReg();
1024Register Val =MI.getOperand(2).getReg();
1025Register LaneSelect =MI.getOperand(3).getReg();
1026Register VDstIn =MI.getOperand(4).getReg();
1027
1028auto MIB =BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1029
1030 std::optional<ValueAndVReg> ConstSelect =
1031getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
1032if (ConstSelect) {
1033// The selector has to be an inline immediate, so we can use whatever for
1034// the other operands.
1035 MIB.addReg(Val);
1036 MIB.addImm(ConstSelect->Value.getSExtValue() &
1037 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
1038 }else {
1039 std::optional<ValueAndVReg> ConstVal =
1040getIConstantVRegValWithLookThrough(Val, *MRI);
1041
1042// If the value written is an inline immediate, we can get away without a
1043// copy to m0.
1044if (ConstVal &&AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
1045 STI.hasInv2PiInlineImm())) {
1046 MIB.addImm(ConstVal->Value.getSExtValue());
1047 MIB.addReg(LaneSelect);
1048 }else {
1049 MIB.addReg(Val);
1050
1051// If the lane selector was originally in a VGPR and copied with
1052// readfirstlane, there's a hazard to read the same SGPR from the
1053// VALU. Constrain to a different SGPR to help avoid needing a nop later.
1054 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
1055
1056BuildMI(*MBB, *MIB,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1057 .addReg(LaneSelect);
1058 MIB.addReg(AMDGPU::M0);
1059 }
1060 }
1061
1062 MIB.addReg(VDstIn);
1063
1064MI.eraseFromParent();
1065returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1066}
1067
1068// We need to handle this here because tablegen doesn't support matching
1069// instructions with multiple outputs.
1070bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const{
1071Register Dst0 =MI.getOperand(0).getReg();
1072Register Dst1 =MI.getOperand(1).getReg();
1073
1074LLT Ty =MRI->getType(Dst0);
1075unsigned Opc;
1076if (Ty ==LLT::scalar(32))
1077 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1078elseif (Ty ==LLT::scalar(64))
1079 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1080else
1081returnfalse;
1082
1083// TODO: Match source modifiers.
1084
1085constDebugLoc &DL =MI.getDebugLoc();
1086MachineBasicBlock *MBB =MI.getParent();
1087
1088Register Numer =MI.getOperand(3).getReg();
1089Register Denom =MI.getOperand(4).getReg();
1090unsigned ChooseDenom =MI.getOperand(5).getImm();
1091
1092Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1093
1094auto MIB =BuildMI(*MBB, &MI,DL, TII.get(Opc), Dst0)
1095 .addDef(Dst1)
1096 .addImm(0)// $src0_modifiers
1097 .addUse(Src0)// $src0
1098 .addImm(0)// $src1_modifiers
1099 .addUse(Denom)// $src1
1100 .addImm(0)// $src2_modifiers
1101 .addUse(Numer)// $src2
1102 .addImm(0)// $clamp
1103 .addImm(0);// $omod
1104
1105MI.eraseFromParent();
1106returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1107}
1108
1109bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const{
1110Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1111switch (IntrinsicID) {
1112case Intrinsic::amdgcn_if_break: {
1113MachineBasicBlock *BB =I.getParent();
1114
1115// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1116// SelectionDAG uses for wave32 vs wave64.
1117BuildMI(*BB, &I,I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1118 .add(I.getOperand(0))
1119 .add(I.getOperand(2))
1120 .add(I.getOperand(3));
1121
1122Register DstReg =I.getOperand(0).getReg();
1123Register Src0Reg =I.getOperand(2).getReg();
1124Register Src1Reg =I.getOperand(3).getReg();
1125
1126I.eraseFromParent();
1127
1128for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1129MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1130
1131returntrue;
1132 }
1133case Intrinsic::amdgcn_interp_p1_f16:
1134return selectInterpP1F16(I);
1135case Intrinsic::amdgcn_wqm:
1136return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1137case Intrinsic::amdgcn_softwqm:
1138return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1139case Intrinsic::amdgcn_strict_wwm:
1140case Intrinsic::amdgcn_wwm:
1141return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1142case Intrinsic::amdgcn_strict_wqm:
1143return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1144case Intrinsic::amdgcn_writelane:
1145return selectWritelane(I);
1146case Intrinsic::amdgcn_div_scale:
1147return selectDivScale(I);
1148case Intrinsic::amdgcn_icmp:
1149case Intrinsic::amdgcn_fcmp:
1150if (selectImpl(I, *CoverageInfo))
1151returntrue;
1152return selectIntrinsicCmp(I);
1153case Intrinsic::amdgcn_ballot:
1154return selectBallot(I);
1155case Intrinsic::amdgcn_reloc_constant:
1156return selectRelocConstant(I);
1157case Intrinsic::amdgcn_groupstaticsize:
1158return selectGroupStaticSize(I);
1159case Intrinsic::returnaddress:
1160return selectReturnAddress(I);
1161case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1162case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1163case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1164case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1165case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1166case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1167case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1168case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1169case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1170case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1171case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1172case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1173case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1174case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1175case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1176case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1177case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1178case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1179case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1180case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1181case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1182case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1183case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1184case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1185case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1186case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1187case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1188case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1189return selectSMFMACIntrin(I);
1190case Intrinsic::amdgcn_permlane16_swap:
1191case Intrinsic::amdgcn_permlane32_swap:
1192return selectPermlaneSwapIntrin(I, IntrinsicID);
1193default:
1194return selectImpl(I, *CoverageInfo);
1195 }
1196}
1197
1198staticintgetV_CMPOpcode(CmpInst::PredicateP,unsignedSize,
1199constGCNSubtarget &ST) {
1200if (Size != 16 &&Size != 32 &&Size != 64)
1201return -1;
1202
1203if (Size == 16 && !ST.has16BitInsts())
1204return -1;
1205
1206constautoSelect = [&](unsigned S16Opc,unsigned TrueS16Opc,
1207unsigned FakeS16Opc,unsigned S32Opc,
1208unsigned S64Opc) {
1209if (Size == 16)
1210// FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code
1211return ST.hasTrue16BitInsts()
1212 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1213 : S16Opc;
1214if (Size == 32)
1215return S32Opc;
1216return S64Opc;
1217 };
1218
1219switch (P) {
1220default:
1221llvm_unreachable("Unknown condition code!");
1222caseCmpInst::ICMP_NE:
1223returnSelect(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1224 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1225 AMDGPU::V_CMP_NE_U64_e64);
1226caseCmpInst::ICMP_EQ:
1227returnSelect(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1228 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1229 AMDGPU::V_CMP_EQ_U64_e64);
1230caseCmpInst::ICMP_SGT:
1231returnSelect(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1232 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1233 AMDGPU::V_CMP_GT_I64_e64);
1234caseCmpInst::ICMP_SGE:
1235returnSelect(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1236 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1237 AMDGPU::V_CMP_GE_I64_e64);
1238caseCmpInst::ICMP_SLT:
1239returnSelect(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1240 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1241 AMDGPU::V_CMP_LT_I64_e64);
1242caseCmpInst::ICMP_SLE:
1243returnSelect(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1244 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1245 AMDGPU::V_CMP_LE_I64_e64);
1246caseCmpInst::ICMP_UGT:
1247returnSelect(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1248 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1249 AMDGPU::V_CMP_GT_U64_e64);
1250caseCmpInst::ICMP_UGE:
1251returnSelect(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1252 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1253 AMDGPU::V_CMP_GE_U64_e64);
1254caseCmpInst::ICMP_ULT:
1255returnSelect(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1256 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1257 AMDGPU::V_CMP_LT_U64_e64);
1258caseCmpInst::ICMP_ULE:
1259returnSelect(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1260 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1261 AMDGPU::V_CMP_LE_U64_e64);
1262
1263caseCmpInst::FCMP_OEQ:
1264returnSelect(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1265 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1266 AMDGPU::V_CMP_EQ_F64_e64);
1267caseCmpInst::FCMP_OGT:
1268returnSelect(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1269 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1270 AMDGPU::V_CMP_GT_F64_e64);
1271caseCmpInst::FCMP_OGE:
1272returnSelect(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1273 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1274 AMDGPU::V_CMP_GE_F64_e64);
1275caseCmpInst::FCMP_OLT:
1276returnSelect(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1277 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1278 AMDGPU::V_CMP_LT_F64_e64);
1279caseCmpInst::FCMP_OLE:
1280returnSelect(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1281 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1282 AMDGPU::V_CMP_LE_F64_e64);
1283caseCmpInst::FCMP_ONE:
1284returnSelect(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1285 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1286 AMDGPU::V_CMP_NEQ_F64_e64);
1287caseCmpInst::FCMP_ORD:
1288returnSelect(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1289 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1290 AMDGPU::V_CMP_O_F64_e64);
1291caseCmpInst::FCMP_UNO:
1292returnSelect(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1293 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1294 AMDGPU::V_CMP_U_F64_e64);
1295caseCmpInst::FCMP_UEQ:
1296returnSelect(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1297 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1298 AMDGPU::V_CMP_NLG_F64_e64);
1299caseCmpInst::FCMP_UGT:
1300returnSelect(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1301 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1302 AMDGPU::V_CMP_NLE_F64_e64);
1303caseCmpInst::FCMP_UGE:
1304returnSelect(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1305 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1306 AMDGPU::V_CMP_NLT_F64_e64);
1307caseCmpInst::FCMP_ULT:
1308returnSelect(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1309 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1310 AMDGPU::V_CMP_NGE_F64_e64);
1311caseCmpInst::FCMP_ULE:
1312returnSelect(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1313 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1314 AMDGPU::V_CMP_NGT_F64_e64);
1315caseCmpInst::FCMP_UNE:
1316returnSelect(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1317 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1318 AMDGPU::V_CMP_NEQ_F64_e64);
1319caseCmpInst::FCMP_TRUE:
1320returnSelect(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1321 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1322 AMDGPU::V_CMP_TRU_F64_e64);
1323caseCmpInst::FCMP_FALSE:
1324returnSelect(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1325 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1326 AMDGPU::V_CMP_F_F64_e64);
1327 }
1328}
1329
1330int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::PredicateP,
1331unsignedSize) const{
1332if (Size == 64) {
1333if (!STI.hasScalarCompareEq64())
1334return -1;
1335
1336switch (P) {
1337caseCmpInst::ICMP_NE:
1338return AMDGPU::S_CMP_LG_U64;
1339caseCmpInst::ICMP_EQ:
1340return AMDGPU::S_CMP_EQ_U64;
1341default:
1342return -1;
1343 }
1344 }
1345
1346if (Size == 32) {
1347switch (P) {
1348caseCmpInst::ICMP_NE:
1349return AMDGPU::S_CMP_LG_U32;
1350caseCmpInst::ICMP_EQ:
1351return AMDGPU::S_CMP_EQ_U32;
1352caseCmpInst::ICMP_SGT:
1353return AMDGPU::S_CMP_GT_I32;
1354caseCmpInst::ICMP_SGE:
1355return AMDGPU::S_CMP_GE_I32;
1356caseCmpInst::ICMP_SLT:
1357return AMDGPU::S_CMP_LT_I32;
1358caseCmpInst::ICMP_SLE:
1359return AMDGPU::S_CMP_LE_I32;
1360caseCmpInst::ICMP_UGT:
1361return AMDGPU::S_CMP_GT_U32;
1362caseCmpInst::ICMP_UGE:
1363return AMDGPU::S_CMP_GE_U32;
1364caseCmpInst::ICMP_ULT:
1365return AMDGPU::S_CMP_LT_U32;
1366caseCmpInst::ICMP_ULE:
1367return AMDGPU::S_CMP_LE_U32;
1368caseCmpInst::FCMP_OEQ:
1369return AMDGPU::S_CMP_EQ_F32;
1370caseCmpInst::FCMP_OGT:
1371return AMDGPU::S_CMP_GT_F32;
1372caseCmpInst::FCMP_OGE:
1373return AMDGPU::S_CMP_GE_F32;
1374caseCmpInst::FCMP_OLT:
1375return AMDGPU::S_CMP_LT_F32;
1376caseCmpInst::FCMP_OLE:
1377return AMDGPU::S_CMP_LE_F32;
1378caseCmpInst::FCMP_ONE:
1379return AMDGPU::S_CMP_LG_F32;
1380caseCmpInst::FCMP_ORD:
1381return AMDGPU::S_CMP_O_F32;
1382caseCmpInst::FCMP_UNO:
1383return AMDGPU::S_CMP_U_F32;
1384caseCmpInst::FCMP_UEQ:
1385return AMDGPU::S_CMP_NLG_F32;
1386caseCmpInst::FCMP_UGT:
1387return AMDGPU::S_CMP_NLE_F32;
1388caseCmpInst::FCMP_UGE:
1389return AMDGPU::S_CMP_NLT_F32;
1390caseCmpInst::FCMP_ULT:
1391return AMDGPU::S_CMP_NGE_F32;
1392caseCmpInst::FCMP_ULE:
1393return AMDGPU::S_CMP_NGT_F32;
1394caseCmpInst::FCMP_UNE:
1395return AMDGPU::S_CMP_NEQ_F32;
1396default:
1397llvm_unreachable("Unknown condition code!");
1398 }
1399 }
1400
1401if (Size == 16) {
1402if (!STI.hasSALUFloatInsts())
1403return -1;
1404
1405switch (P) {
1406caseCmpInst::FCMP_OEQ:
1407return AMDGPU::S_CMP_EQ_F16;
1408caseCmpInst::FCMP_OGT:
1409return AMDGPU::S_CMP_GT_F16;
1410caseCmpInst::FCMP_OGE:
1411return AMDGPU::S_CMP_GE_F16;
1412caseCmpInst::FCMP_OLT:
1413return AMDGPU::S_CMP_LT_F16;
1414caseCmpInst::FCMP_OLE:
1415return AMDGPU::S_CMP_LE_F16;
1416caseCmpInst::FCMP_ONE:
1417return AMDGPU::S_CMP_LG_F16;
1418caseCmpInst::FCMP_ORD:
1419return AMDGPU::S_CMP_O_F16;
1420caseCmpInst::FCMP_UNO:
1421return AMDGPU::S_CMP_U_F16;
1422caseCmpInst::FCMP_UEQ:
1423return AMDGPU::S_CMP_NLG_F16;
1424caseCmpInst::FCMP_UGT:
1425return AMDGPU::S_CMP_NLE_F16;
1426caseCmpInst::FCMP_UGE:
1427return AMDGPU::S_CMP_NLT_F16;
1428caseCmpInst::FCMP_ULT:
1429return AMDGPU::S_CMP_NGE_F16;
1430caseCmpInst::FCMP_ULE:
1431return AMDGPU::S_CMP_NGT_F16;
1432caseCmpInst::FCMP_UNE:
1433return AMDGPU::S_CMP_NEQ_F16;
1434default:
1435llvm_unreachable("Unknown condition code!");
1436 }
1437 }
1438
1439return -1;
1440}
1441
1442bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const{
1443
1444MachineBasicBlock *BB =I.getParent();
1445constDebugLoc &DL =I.getDebugLoc();
1446
1447Register SrcReg =I.getOperand(2).getReg();
1448unsignedSize = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1449
1450auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1451
1452Register CCReg =I.getOperand(0).getReg();
1453if (!isVCC(CCReg, *MRI)) {
1454int Opcode = getS_CMPOpcode(Pred,Size);
1455if (Opcode == -1)
1456returnfalse;
1457MachineInstr *ICmp =BuildMI(*BB, &I,DL, TII.get(Opcode))
1458 .add(I.getOperand(2))
1459 .add(I.getOperand(3));
1460BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), CCReg)
1461 .addReg(AMDGPU::SCC);
1462boolRet =
1463constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1464 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1465I.eraseFromParent();
1466returnRet;
1467 }
1468
1469if (I.getOpcode() == AMDGPU::G_FCMP)
1470returnfalse;
1471
1472int Opcode =getV_CMPOpcode(Pred,Size, *Subtarget);
1473if (Opcode == -1)
1474returnfalse;
1475
1476MachineInstr *ICmp =BuildMI(*BB, &I,DL, TII.get(Opcode),
1477I.getOperand(0).getReg())
1478 .add(I.getOperand(2))
1479 .add(I.getOperand(3));
1480 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1481 *TRI.getBoolRC(), *MRI);
1482boolRet =constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1483I.eraseFromParent();
1484returnRet;
1485}
1486
1487bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const{
1488Register Dst =I.getOperand(0).getReg();
1489if (isVCC(Dst, *MRI))
1490returnfalse;
1491
1492LLT DstTy =MRI->getType(Dst);
1493if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1494returnfalse;
1495
1496MachineBasicBlock *BB =I.getParent();
1497constDebugLoc &DL =I.getDebugLoc();
1498Register SrcReg =I.getOperand(2).getReg();
1499unsignedSize = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1500
1501// i1 inputs are not supported in GlobalISel.
1502if (Size == 1)
1503returnfalse;
1504
1505auto Pred =static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1506if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1507BuildMI(*BB, &I,DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1508I.eraseFromParent();
1509return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1510 }
1511
1512constint Opcode =getV_CMPOpcode(Pred,Size, *Subtarget);
1513if (Opcode == -1)
1514returnfalse;
1515
1516MachineInstrBuilder SelectedMI;
1517MachineOperand &LHS =I.getOperand(2);
1518MachineOperand &RHS =I.getOperand(3);
1519auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS.getReg());
1520auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS.getReg());
1521Register Src0Reg =
1522 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I,/*ForceVGPR*/true);
1523Register Src1Reg =
1524 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I,/*ForceVGPR*/true);
1525 SelectedMI =BuildMI(*BB, &I,DL, TII.get(Opcode), Dst);
1526if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1527 SelectedMI.addImm(Src0Mods);
1528 SelectedMI.addReg(Src0Reg);
1529if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1530 SelectedMI.addImm(Src1Mods);
1531 SelectedMI.addReg(Src1Reg);
1532if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1533 SelectedMI.addImm(0);// clamp
1534if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1535 SelectedMI.addImm(0);// op_sel
1536
1537 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1538if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1539returnfalse;
1540
1541I.eraseFromParent();
1542returntrue;
1543}
1544
// Ballot has to zero the bits of the input lane mask that are zero in the
// current exec; this is done as an AND with exec. For inputs produced by
// instructions that implicitly use the same exec (for example, compares in the
// same basic block or an SCC-to-VCC copy), a plain copy is used instead.
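// Illustrative example (added note, hypothetical MIR, wave64): for
//   %c:vcc(s1) = G_ICMP intpred(eq), %a, %b
//   %r:sgpr(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.ballot), %c
// in the same block, the compare is already masked by the current exec, so the
// ballot selects to %r = COPY %c; otherwise it selects to
//   %r = S_AND_B64 %c, $exec.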
1549staticboolisLaneMaskFromSameBlock(Register Reg,MachineRegisterInfo &MRI,
1550MachineBasicBlock *MBB) {
1551MachineInstr *MI =MRI.getVRegDef(Reg);
1552if (MI->getParent() !=MBB)
1553returnfalse;
1554
1555// Lane mask generated by SCC to VCC copy.
1556if (MI->getOpcode() == AMDGPU::COPY) {
1557auto DstRB =MRI.getRegBankOrNull(MI->getOperand(0).getReg());
1558auto SrcRB =MRI.getRegBankOrNull(MI->getOperand(1).getReg());
1559if (DstRB && SrcRB && DstRB->getID() == AMDGPU::VCCRegBankID &&
1560 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1561returntrue;
1562 }
1563
1564// Lane mask generated using compare with same exec.
1565if (isa<GAnyCmp>(MI))
1566returntrue;
1567
1568RegisterLHS,RHS;
1569// Look through AND.
1570if (mi_match(Reg,MRI,m_GAnd(m_Reg(LHS),m_Reg(RHS))))
1571returnisLaneMaskFromSameBlock(LHS,MRI,MBB) ||
1572isLaneMaskFromSameBlock(RHS,MRI,MBB);
1573
1574returnfalse;
1575}
1576
1577bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const{
1578MachineBasicBlock *BB =I.getParent();
1579constDebugLoc &DL =I.getDebugLoc();
1580Register DstReg =I.getOperand(0).getReg();
1581Register SrcReg =I.getOperand(2).getReg();
1582constunsigned BallotSize =MRI->getType(DstReg).getSizeInBits();
1583constunsigned WaveSize = STI.getWavefrontSize();
1584
1585// In the common case, the return type matches the wave size.
1586// However we also support emitting i64 ballots in wave32 mode.
1587if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1588returnfalse;
1589
1590 std::optional<ValueAndVReg> Arg =
1591getIConstantVRegValWithLookThrough(SrcReg, *MRI);
1592
1593Register Dst = DstReg;
1594// i64 ballot on Wave32: new Dst(i32) for WaveSize ballot.
1595if (BallotSize != WaveSize) {
1596 Dst =MRI->createVirtualRegister(TRI.getBoolRC());
1597 }
1598
1599if (Arg) {
1600const int64_tValue = Arg->Value.getZExtValue();
1601if (Value == 0) {
1602// Dst = S_MOV 0
1603unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1604BuildMI(*BB, &I,DL, TII.get(Opcode), Dst).addImm(0);
1605 }else {
1606// Dst = COPY EXEC
1607assert(Value == 1);
1608BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), Dst).addReg(TRI.getExec());
1609 }
1610if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1611returnfalse;
1612 }else {
1613if (isLaneMaskFromSameBlock(SrcReg, *MRI, BB)) {
1614// Dst = COPY SrcReg
1615BuildMI(*BB, &I,DL, TII.get(AMDGPU::COPY), Dst).addReg(SrcReg);
1616if (!RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI))
1617returnfalse;
1618 }else {
1619// Dst = S_AND SrcReg, EXEC
1620unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1621autoAnd =BuildMI(*BB, &I,DL, TII.get(AndOpc), Dst)
1622 .addReg(SrcReg)
1623 .addReg(TRI.getExec())
1624 .setOperandDead(3);// Dead scc
1625if (!constrainSelectedInstRegOperands(*And, TII, TRI, RBI))
1626returnfalse;
1627 }
1628 }
1629
1630// i64 ballot on Wave32: zero-extend i32 ballot to i64.
1631if (BallotSize != WaveSize) {
1632Register HiReg =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1633BuildMI(*BB, &I,DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1634BuildMI(*BB, &I,DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1635 .addReg(Dst)
1636 .addImm(AMDGPU::sub0)
1637 .addReg(HiReg)
1638 .addImm(AMDGPU::sub1);
1639 }
1640
1641I.eraseFromParent();
1642returntrue;
1643}
1644
bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
  const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
  if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
    return false;

  const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;

  Module *M = MF->getFunction().getParent();
  const MDNode *Metadata = I.getOperand(2).getMetadata();
  auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
  auto *RelocSymbol = cast<GlobalVariable>(
      M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));

  MachineBasicBlock *BB = I.getParent();
  BuildMI(*BB, &I, I.getDebugLoc(),
          TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
      .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
  Triple::OSType OS = MF->getTarget().getTargetTriple().getOS();

  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
    AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;

  MachineBasicBlock *MBB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);

  if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
    const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
    MIB.addImm(MFI->getLDSSize());
  } else {
    Module *M = MF->getFunction().getParent();
    const GlobalValue *GV =
        Intrinsic::getOrInsertDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
    MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
  }

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
  MachineBasicBlock *MBB = I.getParent();
  MachineFunction &MF = *MBB->getParent();
  const DebugLoc &DL = I.getDebugLoc();

  MachineOperand &Dst = I.getOperand(0);
  Register DstReg = Dst.getReg();
  unsigned Depth = I.getOperand(2).getImm();

  const TargetRegisterClass *RC
    = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
  if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
      !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
    return false;

  // Check for kernel and shader functions
  if (Depth != 0 ||
      MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
        .addImm(0);
    I.eraseFromParent();
    return true;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  // There is a call to @llvm.returnaddress in this function
  MFI.setReturnAddressIsTaken(true);

  // Get the return address reg and mark it as an implicit live-in
  Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
  Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
                                             AMDGPU::SReg_64RegClass, DL);
  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
      .addReg(LiveIn);
  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
  // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
  // SelectionDAG uses for wave32 vs wave64.
  MachineBasicBlock *BB = MI.getParent();
  BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
      .add(MI.getOperand(1));

  Register Reg = MI.getOperand(1).getReg();
  MI.eraseFromParent();

  if (!MRI->getRegClassOrNull(Reg))
    MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
  return true;
}

bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
    MachineInstr &MI, Intrinsic::ID IntrID) const {
  MachineBasicBlock *MBB = MI.getParent();
  MachineFunction *MF = MBB->getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  unsigned IndexOperand = MI.getOperand(7).getImm();
  bool WaveRelease = MI.getOperand(8).getImm() != 0;
  bool WaveDone = MI.getOperand(9).getImm() != 0;

  if (WaveDone && !WaveRelease)
    report_fatal_error("ds_ordered_count: wave_done requires wave_release");

  unsigned OrderedCountIndex = IndexOperand & 0x3f;
  IndexOperand &= ~0x3f;
  unsigned CountDw = 0;

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
    CountDw = (IndexOperand >> 24) & 0xf;
    IndexOperand &= ~(0xf << 24);

    if (CountDw < 1 || CountDw > 4) {
      report_fatal_error(
          "ds_ordered_count: dword count must be between 1 and 4");
    }
  }

  if (IndexOperand)
    report_fatal_error("ds_ordered_count: bad index operand");

  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
  unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

  if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
    Offset1 |= (CountDw - 1) << 6;

  if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
    Offset1 |= ShaderType << 2;

  unsigned Offset = Offset0 | (Offset1 << 8);

  Register M0Val = MI.getOperand(2).getReg();
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(M0Val);

  Register DstReg = MI.getOperand(0).getReg();
  Register ValReg = MI.getOperand(3).getReg();
  MachineInstrBuilder DS =
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
          .addReg(ValReg)
          .addImm(Offset)
          .cloneMemRefs(MI);

  if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
  MI.eraseFromParent();
  return Ret;
}

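// Map a GWS intrinsic ID to the corresponding DS_GWS_* machine opcode.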
static unsigned gwsIntrinToOpcode(unsigned IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_ds_gws_init:
    return AMDGPU::DS_GWS_INIT;
  case Intrinsic::amdgcn_ds_gws_barrier:
    return AMDGPU::DS_GWS_BARRIER;
  case Intrinsic::amdgcn_ds_gws_sema_v:
    return AMDGPU::DS_GWS_SEMA_V;
  case Intrinsic::amdgcn_ds_gws_sema_br:
    return AMDGPU::DS_GWS_SEMA_BR;
  case Intrinsic::amdgcn_ds_gws_sema_p:
    return AMDGPU::DS_GWS_SEMA_P;
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
  default:
    llvm_unreachable("not a gws intrinsic");
  }
}

bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
                                                     Intrinsic::ID IID) const {
  if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
                        !STI.hasGWSSemaReleaseAll()))
    return false;

  // intrinsic ID, vsrc, offset
  const bool HasVSrc = MI.getNumOperands() == 3;
  assert(HasVSrc || MI.getNumOperands() == 2);

  Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
  const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
  if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  unsigned ImmOffset;

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *Readfirstlane = nullptr;

  // If we legalized the VGPR input, strip out the readfirstlane to analyze the
  // incoming offset, in case there's an add of a constant. We'll have to put it
  // back later.
  if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
    Readfirstlane = OffsetDef;
    BaseOffset = OffsetDef->getOperand(1).getReg();
    OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
  }

  if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
    // If we have a constant offset, try to use the 0 in m0 as the base.
    // TODO: Look into changing the default m0 initialization value. If the
    // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
    // the immediate offset.

    ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addImm(0);
  } else {
    std::tie(BaseOffset, ImmOffset) =
        AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);

    if (Readfirstlane) {
      // We have the constant offset now, so put the readfirstlane back on the
      // variable component.
      if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
        return false;

      Readfirstlane->getOperand(1).setReg(BaseOffset);
      BaseOffset = Readfirstlane->getOperand(0).getReg();
    } else {
      if (!RBI.constrainGenericRegister(BaseOffset,
                                        AMDGPU::SReg_32RegClass, *MRI))
        return false;
    }

    Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
        .addReg(BaseOffset)
        .addImm(16)
        .setOperandDead(3); // Dead scc

    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
        .addReg(M0Base);
  }

  // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
  // offset field) % 64. Some versions of the programming guide omit the m0
  // part, or claim it's from offset 0.
  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));

  if (HasVSrc) {
    Register VSrc = MI.getOperand(1).getReg();
    MIB.addReg(VSrc);

    if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
      return false;
  }

  MIB.addImm(ImmOffset)
     .cloneMemRefs(MI);

  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);

  MI.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
                                                      bool IsAppend) const {
  Register PtrBase = MI.getOperand(2).getReg();
  LLT PtrTy = MRI->getType(PtrBase);
  bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;

  unsigned Offset;
  std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));

  // TODO: Should this try to look through readfirstlane like GWS?
  if (!isDSOffsetLegal(PtrBase, Offset)) {
    PtrBase = MI.getOperand(2).getReg();
    Offset = 0;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;

  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
      .addReg(PtrBase);
  if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
                 .addImm(Offset)
                 .addImm(IsGDS ? -1 : 0)
                 .cloneMemRefs(MI);
  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
  MachineFunction *MF = MI.getParent()->getParent();
  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();

  MFInfo->setInitWholeWave();
  return selectImpl(MI, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
  if (TM.getOptLevel() > CodeGenOptLevel::None) {
    unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
    if (WGSize <= STI.getWavefrontSize()) {
      // If the workgroup fits in a wave, remove s_barrier_signal and lower
      // s_barrier/s_barrier_wait to wave_barrier.
      if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
          IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
        MachineBasicBlock *MBB = MI.getParent();
        const DebugLoc &DL = MI.getDebugLoc();
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
      }
      MI.eraseFromParent();
      return true;
    }
  }

  if (STI.hasSplitBarriers() && IntrinsicID == Intrinsic::amdgcn_s_barrier) {
    // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
    MachineBasicBlock *MBB = MI.getParent();
    const DebugLoc &DL = MI.getDebugLoc();
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
        .addImm(AMDGPU::Barrier::WORKGROUP);
    BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
        .addImm(AMDGPU::Barrier::WORKGROUP);
    MI.eraseFromParent();
    return true;
  }

  return selectImpl(MI, *CoverageInfo);
}

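// Decode the texfailctrl immediate of an image intrinsic: bit 0 requests TFE
// and bit 1 requests LWE; any nonzero value marks a potential texture-fail
// path. Returns false if bits other than TFE and LWE are set.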
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
                         bool &IsTexFail) {
  if (TexFailCtrl)
    IsTexFail = true;

  TFE = (TexFailCtrl & 0x1) ? true : false;
  TexFailCtrl &= ~(uint64_t)0x1;
  LWE = (TexFailCtrl & 0x2) ? true : false;
  TexFailCtrl &= ~(uint64_t)0x2;

  return TexFailCtrl == 0;
}

bool AMDGPUInstructionSelector::selectImageIntrinsic(
    MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
      AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);

  const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
  unsigned IntrOpcode = Intr->BaseOpcode;
  const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
  const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);

  const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;

  Register VDataIn, VDataOut;
  LLT VDataTy;
  int NumVDataDwords = -1;
  bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
               MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;

  bool Unorm;
  if (!BaseOpcode->Sampler)
    Unorm = true;
  else
    Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;

  bool TFE;
  bool LWE;
  bool IsTexFail = false;
  if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
                    TFE, LWE, IsTexFail))
    return false;

  const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
  const bool IsA16 = (Flags & 1) != 0;
  const bool IsG16 = (Flags & 2) != 0;

  // A16 implies 16 bit gradients if subtarget doesn't support G16
  if (IsA16 && !STI.hasG16() && !IsG16)
    return false;

  unsigned DMask = 0;
  unsigned DMaskLanes = 0;

  if (BaseOpcode->Atomic) {
    VDataOut = MI.getOperand(0).getReg();
    VDataIn = MI.getOperand(2).getReg();
    LLT Ty = MRI->getType(VDataIn);

    // Be careful to allow atomic swap on 16-bit element vectors.
    const bool Is64Bit = BaseOpcode->AtomicX2 ?
      Ty.getSizeInBits() == 128 :
      Ty.getSizeInBits() == 64;

    if (BaseOpcode->AtomicX2) {
      assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);

      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
    } else {
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    }
  } else {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
    DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);

    if (BaseOpcode->Store) {
      VDataIn = MI.getOperand(1).getReg();
      VDataTy = MRI->getType(VDataIn);
      NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
    } else if (BaseOpcode->NoReturn) {
      NumVDataDwords = 0;
    } else {
      VDataOut = MI.getOperand(0).getReg();
      VDataTy = MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      if (IsD16 && !STI.hasUnpackedD16VMem())
        NumVDataDwords = (DMaskLanes + 1) / 2;
    }
  }

  // Set G16 opcode
  if (Subtarget->hasG16() && IsG16) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
        AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
    assert(G16MappingInfo);
    IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
  }

  // TODO: Check this in verifier.
  assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");

  unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
  if (BaseOpcode->Atomic)
    CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
  if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
               AMDGPU::CPol::VOLATILE))
    return false;

  int NumVAddrRegs = 0;
  int NumVAddrDwords = 0;
  for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
    // Skip the $noregs and 0s inserted during legalization.
    MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
    if (!AddrOp.isReg())
      continue; // XXX - Break?

    Register Addr = AddrOp.getReg();
    if (!Addr)
      break;

    ++NumVAddrRegs;
    NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
  }

  // The legalizer preprocessed the intrinsic arguments. If we aren't using
  // NSA, these should have been packed into a single value in the first
  // address register
  const bool UseNSA =
      NumVAddrRegs != 1 &&
      (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
                                   : NumVAddrDwords == NumVAddrRegs);
  if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
    LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
    return false;
  }

  if (IsTexFail)
    ++NumVDataDwords;

  int Opcode = -1;
  if (IsGFX12Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx11NSA
                                          : AMDGPU::MIMGEncGfx11Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
    Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
                                   UseNSA ? AMDGPU::MIMGEncGfx10NSA
                                          : AMDGPU::MIMGEncGfx10Default,
                                   NumVDataDwords, NumVAddrDwords);
  } else {
    if (Subtarget->hasGFX90AInsts()) {
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
                                     NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
        LLVM_DEBUG(
            dbgs()
            << "requested image instruction is not supported on this GPU\n");
        return false;
      }
    }
    if (Opcode == -1 &&
        STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
                                     NumVDataDwords, NumVAddrDwords);
    if (Opcode == -1)
      Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
                                     NumVDataDwords, NumVAddrDwords);
  }
  if (Opcode == -1)
    return false;

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
                 .cloneMemRefs(MI);

  if (VDataOut) {
    if (BaseOpcode->AtomicX2) {
      const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;

      Register TmpReg = MRI->createVirtualRegister(
          Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
      unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;

      MIB.addDef(TmpReg);
      if (!MRI->use_empty(VDataOut)) {
        BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
            .addReg(TmpReg, RegState::Kill, SubReg);
      }

    } else {
      MIB.addDef(VDataOut); // vdata output
    }
  }

  if (VDataIn)
    MIB.addReg(VDataIn); // vdata input

  for (int I = 0; I != NumVAddrRegs; ++I) {
    MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
    if (SrcOp.isReg()) {
      assert(SrcOp.getReg() != 0);
      MIB.addReg(SrcOp.getReg());
    }
  }

  MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
  if (BaseOpcode->Sampler)
    MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());

  MIB.addImm(DMask); // dmask

  if (IsGFX10Plus)
    MIB.addImm(DimInfo->Encoding);
  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
    MIB.addImm(Unorm);

  MIB.addImm(CPol);
  MIB.addImm(IsA16 && // a16 or r128
             STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
  if (IsGFX10Plus)
    MIB.addImm(IsA16 ? -1 : 0);

  if (!Subtarget->hasGFX90AInsts()) {
    MIB.addImm(TFE); // tfe
  } else if (TFE) {
    LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
    return false;
  }

  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
    MIB.addImm(LWE); // lwe
  if (!IsGFX10Plus)
    MIB.addImm(DimInfo->DA ? -1 : 0);
  if (BaseOpcode->HasD16)
    MIB.addImm(IsD16 ? -1 : 0);

  MI.eraseFromParent();
  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
  return true;
}

// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
    MachineInstr &MI) const {
  Register Dst0 = MI.getOperand(0).getReg();
  Register Dst1 = MI.getOperand(1).getReg();

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock *MBB = MI.getParent();

  Register Addr = MI.getOperand(3).getReg();
  Register Data0 = MI.getOperand(4).getReg();
  Register Data1 = MI.getOperand(5).getReg();
  unsigned Offset = MI.getOperand(6).getImm();

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
                 .addDef(Dst1)
                 .addUse(Addr)
                 .addUse(Data0)
                 .addUse(Data1)
                 .addImm(Offset)
                 .cloneMemRefs(MI);

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}

bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
    MachineInstr &I) const {
  Intrinsic::ID IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_end_cf:
    return selectEndCfIntrinsic(I);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
    return selectDSOrderedIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all:
    return selectDSGWSIntrinsic(I, IntrinsicID);
  case Intrinsic::amdgcn_ds_append:
    return selectDSAppendConsume(I, true);
  case Intrinsic::amdgcn_ds_consume:
    return selectDSAppendConsume(I, false);
  case Intrinsic::amdgcn_init_whole_wave:
    return selectInitWholeWave(I);
  case Intrinsic::amdgcn_s_barrier:
  case Intrinsic::amdgcn_s_barrier_signal:
  case Intrinsic::amdgcn_s_barrier_wait:
    return selectSBarrier(I);
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
    return selectBufferLoadLds(I);
  case Intrinsic::amdgcn_global_load_lds:
    return selectGlobalLoadLds(I);
  case Intrinsic::amdgcn_exp_compr:
    if (!STI.hasCompressedExport()) {
      Function &F = I.getMF()->getFunction();
      DiagnosticInfoUnsupported NoFpRet(
          F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
      F.getContext().diagnose(NoFpRet);
      return false;
    }
    break;
  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
    return selectDSBvhStackIntrinsic(I);
  case Intrinsic::amdgcn_s_barrier_init:
  case Intrinsic::amdgcn_s_barrier_signal_var:
    return selectNamedBarrierInit(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_join:
  case Intrinsic::amdgcn_s_get_named_barrier_state:
    return selectNamedBarrierInst(I, IntrinsicID);
  case Intrinsic::amdgcn_s_get_barrier_state:
    return selectSGetBarrierState(I, IntrinsicID);
  case Intrinsic::amdgcn_s_barrier_signal_isfirst:
    return selectSBarrierSignalIsfirst(I, IntrinsicID);
  }
  return selectImpl(I, *CoverageInfo);
}

bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
  if (selectImpl(I, *CoverageInfo))
    return true;

  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  Register DstReg = I.getOperand(0).getReg();
  unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
  assert(Size <= 32 || Size == 64);
  const MachineOperand &CCOp = I.getOperand(1);
  Register CCReg = CCOp.getReg();
  if (!isVCC(CCReg, *MRI)) {
    unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
                                         AMDGPU::S_CSELECT_B32;
    MachineInstr *CopySCC =
        BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
            .addReg(CCReg);

    // The generic constrainSelectedInstRegOperands doesn't work for the scc
    // register bank, because it does not cover the register class we use to
    // represent it. So we need to set the register class manually here.
    if (!MRI->getRegClassOrNull(CCReg))
      MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
    MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
                               .add(I.getOperand(2))
                               .add(I.getOperand(3));

    bool Ret = false;
    Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
    Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
    I.eraseFromParent();
    return Ret;
  }

  // Wide VGPR select should have been split in RegBankSelect.
  if (Size > 32)
    return false;

  MachineInstr *Select =
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
          .addImm(0)
          .add(I.getOperand(3))
          .addImm(0)
          .add(I.getOperand(2))
          .add(I.getOperand(1));

  bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
  I.eraseFromParent();
  return Ret;
}

bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const LLT S1 = LLT::scalar(1);

  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *DstRB;
  if (DstTy == S1) {
    // This is a special case. We don't treat s1 for legalization artifacts as
    // vcc booleans.
    DstRB = SrcRB;
  } else {
    DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
    if (SrcRB != DstRB)
      return false;
  }

  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  unsigned DstSize = DstTy.getSizeInBits();
  unsigned SrcSize = SrcTy.getSizeInBits();

  const TargetRegisterClass *SrcRC =
      TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
  const TargetRegisterClass *DstRC =
      TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
  if (!SrcRC || !DstRC)
    return false;

  if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
    return false;
  }

  if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
    assert(STI.useRealTrue16Insts());
    const DebugLoc &DL = I.getDebugLoc();
    MachineBasicBlock *MBB = I.getParent();
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), DstReg)
        .addReg(SrcReg, 0, AMDGPU::lo16);
    I.eraseFromParent();
    return true;
  }

  if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
    MachineBasicBlock *MBB = I.getParent();
    const DebugLoc &DL = I.getDebugLoc();

    Register LoReg = MRI->createVirtualRegister(DstRC);
    Register HiReg = MRI->createVirtualRegister(DstRC);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
        .addReg(SrcReg, 0, AMDGPU::sub0);
    BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
        .addReg(SrcReg, 0, AMDGPU::sub1);

    if (IsVALU && STI.hasSDWA()) {
      // Write the low 16-bits of the high element into the high 16-bits of the
      // low element.
      MachineInstr *MovSDWA =
          BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
              .addImm(0)                             // $src0_modifiers
              .addReg(HiReg)                         // $src0
              .addImm(0)                             // $clamp
              .addImm(AMDGPU::SDWA::WORD_1)          // $dst_sel
              .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
              .addImm(AMDGPU::SDWA::WORD_0)          // $src0_sel
              .addReg(LoReg, RegState::Implicit);
      MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
    } else {
      Register TmpReg0 = MRI->createVirtualRegister(DstRC);
      Register TmpReg1 = MRI->createVirtualRegister(DstRC);
      Register ImmReg = MRI->createVirtualRegister(DstRC);
      if (IsVALU) {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
            .addImm(16)
            .addReg(HiReg);
      } else {
        BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
            .addReg(HiReg)
            .addImm(16)
            .setOperandDead(3); // Dead scc
      }

      unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
      unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
      unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;

      BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
          .addImm(0xffff);
      auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
                     .addReg(LoReg)
                     .addReg(ImmReg);
      auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
                    .addReg(TmpReg0)
                    .addReg(TmpReg1);

      if (!IsVALU) {
        And.setOperandDead(3); // Dead scc
        Or.setOperandDead(3);  // Dead scc
      }
    }

    I.eraseFromParent();
    return true;
  }

  if (!DstTy.isScalar())
    return false;

  if (SrcSize > 32) {
    unsigned SubRegIdx =
        DstSize < 32 ? AMDGPU::sub0 : TRI.getSubRegFromChannel(0, DstSize / 32);
    if (SubRegIdx == AMDGPU::NoSubRegister)
      return false;

    // Deal with weird cases where the class only partially supports the subreg
    // index.
    const TargetRegisterClass *SrcWithSubRC
      = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
    if (!SrcWithSubRC)
      return false;

    if (SrcWithSubRC != SrcRC) {
      if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
        return false;
    }

    I.getOperand(1).setSubReg(SubRegIdx);
  }

  I.setDesc(TII.get(TargetOpcode::COPY));
  return true;
}

/// \returns true if a bitmask for \p Size bits will be an inline immediate.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
}

// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
    Register Reg, const MachineRegisterInfo &MRI,
    const TargetRegisterInfo &TRI) const {
  const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
    return RB;

  // Ignore the type, since we don't use vcc in artifacts.
  if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
    return &RBI.getRegBankFromRegClass(*RC, LLT());
  return nullptr;
}

bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
  bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
  bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
  const DebugLoc &DL = I.getDebugLoc();
  MachineBasicBlock &MBB = *I.getParent();
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();

  const LLT DstTy = MRI->getType(DstReg);
  const LLT SrcTy = MRI->getType(SrcReg);
  const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
    I.getOperand(2).getImm() : SrcTy.getSizeInBits();
  const unsigned DstSize = DstTy.getSizeInBits();
  if (!DstTy.isScalar())
    return false;

  // Artifact casts should never use vcc.
  const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);

  // FIXME: This should probably be illegal and split earlier.
  if (I.getOpcode() == AMDGPU::G_ANYEXT) {
    if (DstSize <= 32)
      return selectCOPY(I);

    const TargetRegisterClass *SrcRC =
        TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
    const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
    const TargetRegisterClass *DstRC =
        TRI.getRegClassForSizeOnBank(DstSize, *DstBank);

    Register UndefReg = MRI->createVirtualRegister(SrcRC);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
    BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
        .addReg(SrcReg)
        .addImm(AMDGPU::sub0)
        .addReg(UndefReg)
        .addImm(AMDGPU::sub1);
    I.eraseFromParent();

    return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
           RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
  }

  if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
    // 64-bit should have been split up in RegBankSelect

    // Try to use an and with a mask if it will save code size.
    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      MachineInstr *ExtI =
          BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
              .addImm(Mask)
              .addReg(SrcReg);
      I.eraseFromParent();
      return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
    }

    const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
    MachineInstr *ExtI =
        BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
            .addReg(SrcReg)
            .addImm(0)        // Offset
            .addImm(SrcSize); // Width
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
  }

  if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
    const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
      AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
    if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
      return false;

    if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
      const unsigned SextOpc = SrcSize == 8 ?
        AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
      BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
          .addReg(SrcReg);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
    }

    // Using a single 32-bit SALU to calculate the high half is smaller than
    // S_BFE with a literal constant operand.
    if (DstSize > 32 && SrcSize == 32) {
      Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
      if (Signed) {
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
            .addReg(SrcReg, 0, SubReg)
            .addImm(31)
            .setOperandDead(3); // Dead scc
      } else {
        BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
            .addImm(0);
      }
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
          .addReg(SrcReg, 0, SubReg)
          .addImm(AMDGPU::sub0)
          .addReg(HiReg)
          .addImm(AMDGPU::sub1);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
                                          *MRI);
    }

    const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
    const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;

    // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
    if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
      // We need a 64-bit register source, but the high bits don't matter.
      Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
      Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
      unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;

      BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
      BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
          .addReg(SrcReg, 0, SubReg)
          .addImm(AMDGPU::sub0)
          .addReg(UndefReg)
          .addImm(AMDGPU::sub1);

      BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
          .addReg(ExtReg)
          .addImm(SrcSize << 16);

      I.eraseFromParent();
      return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
    }

    unsigned Mask;
    if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
      BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
          .addReg(SrcReg)
          .addImm(Mask)
          .setOperandDead(3); // Dead scc
    } else {
      BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
          .addReg(SrcReg)
          .addImm(SrcSize << 16);
    }

    I.eraseFromParent();
    return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
  }

  return false;
}

static Register stripCopy(Register Reg, MachineRegisterInfo &MRI) {
  return getDefSrcRegIgnoringCopies(Reg, MRI)->Reg;
}

static Register stripBitCast(Register Reg, MachineRegisterInfo &MRI) {
  Register BitcastSrc;
  if (mi_match(Reg, MRI, m_GBitcast(m_Reg(BitcastSrc))))
    Reg = BitcastSrc;
  return Reg;
}

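// Match a pattern that extracts the high 16-bit half of a 32-bit value:
// either a G_TRUNC of a logical shift right by 16, or a G_TRUNC of a shuffle
// of a v2s16 vector whose mask selects element 1. On success, \p Out is set
// to the underlying source register.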
static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
                           Register &Out) {
  Register Trunc;
  if (!mi_match(In, MRI, m_GTrunc(m_Reg(Trunc))))
    return false;

  Register LShlSrc;
  Register Cst;
  if (mi_match(Trunc, MRI, m_GLShr(m_Reg(LShlSrc), m_Reg(Cst)))) {
    Cst = stripCopy(Cst, MRI);
    if (mi_match(Cst, MRI, m_SpecificICst(16))) {
      Out = stripBitCast(LShlSrc, MRI);
      return true;
    }
  }

  MachineInstr *Shuffle = MRI.getVRegDef(Trunc);
  if (Shuffle->getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
    return false;

  assert(MRI.getType(Shuffle->getOperand(0).getReg()) ==
         LLT::fixed_vector(2, 16));

  ArrayRef<int> Mask = Shuffle->getOperand(3).getShuffleMask();
  assert(Mask.size() == 2);

  if (Mask[0] == 1 && Mask[1] <= 1) {
    Out = Shuffle->getOperand(0).getReg();
    return true;
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
  if (!Subtarget->hasSALUFloatInsts())
    return false;

  Register Dst = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID)
    return false;

  Register Src = I.getOperand(1).getReg();

  if (MRI->getType(Dst) == LLT::scalar(32) &&
      MRI->getType(Src) == LLT::scalar(16)) {
    if (isExtractHiElt(*MRI, Src, Src)) {
      MachineBasicBlock *BB = I.getParent();
      BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
          .addUse(Src);
      I.eraseFromParent();
      return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
    }
  }

  return false;
}

bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
  // Only manually handle the f64 SGPR case.
  //
  // FIXME: This is a workaround for 2.5 different tablegen problems. Because
  // the bit ops theoretically have a second result due to the implicit def of
  // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
  // that is easy by disabling the check. The result works, but uses a
  // nonsensical sreg32orlds_and_sreg_1 regclass.
  //
  // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
  // the variadic REG_SEQUENCE operands.

  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
  if (Fabs)
    Src = Fabs->getOperand(1).getReg();

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
      .addImm(0x80000000);

  // Set or toggle sign bit.
  unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
  BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
      .addReg(HiReg)
      .addReg(ConstReg)
      .setOperandDead(3); // Dead scc
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(LoReg)
      .addImm(AMDGPU::sub0)
      .addReg(OpReg)
      .addImm(AMDGPU::sub1);
  MI.eraseFromParent();
  return true;
}

// FIXME: This is a workaround for the same tablegen problems as G_FNEG
bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
  Register Dst = MI.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
  if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
      MRI->getType(Dst) != LLT::scalar(64))
    return false;

  Register Src = MI.getOperand(1).getReg();
  MachineBasicBlock *BB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
      !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
    return false;

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(Src, 0, AMDGPU::sub0);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(Src, 0, AMDGPU::sub1);
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
      .addImm(0x7fffffff);

  // Clear sign bit.
  // TODO: Should this use S_BITSET0_*?
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
      .addReg(HiReg)
      .addReg(ConstReg)
      .setOperandDead(3); // Dead scc
  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
      .addReg(LoReg)
      .addImm(AMDGPU::sub0)
      .addReg(OpReg)
      .addImm(AMDGPU::sub1);

  MI.eraseFromParent();
  return true;
}

static bool isConstant(const MachineInstr &MI) {
  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}

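// Collect addressing-mode information for \p Load by walking the G_PTR_ADD
// feeding its pointer operand: record the constant immediate part and the
// SGPR/VGPR register parts of each level of the address computation.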
void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {

  unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
  const MachineInstr *PtrMI =
      MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());

  assert(PtrMI);

  if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
    return;

  GEPInfo GEPInfo;

  for (unsigned i = 1; i != 3; ++i) {
    const MachineOperand &GEPOp = PtrMI->getOperand(i);
    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
    assert(OpDef);
    if (i == 2 && isConstant(*OpDef)) {
      // TODO: Could handle constant base + variable offset, but a combine
      // probably should have commuted it.
      assert(GEPInfo.Imm == 0);
      GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
      continue;
    }
    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
      GEPInfo.SgprParts.push_back(GEPOp.getReg());
    else
      GEPInfo.VgprParts.push_back(GEPOp.getReg());
  }

  AddrInfo.push_back(GEPInfo);
  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
}

bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
  return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
}

bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
  if (!MI.hasOneMemOperand())
    return false;

  const MachineMemOperand *MMO = *MI.memoperands_begin();
  const Value *Ptr = MMO->getValue();

  // UndefValue means this is a load of a kernel input. These are uniform.
  // Sometimes LDS instructions have constant pointers.
  // If Ptr is null, then that means this mem operand contains a
  // PseudoSourceValue like GOT.
  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
    return true;

  if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return true;

  if (MI.getOpcode() == AMDGPU::G_PREFETCH)
    return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
           AMDGPU::SGPRRegBankID;

  const Instruction *I = dyn_cast<Instruction>(Ptr);
  return I && I->getMetadata("amdgpu.uniform");
}

bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
  for (const GEPInfo &GEPInfo : AddrInfo) {
    if (!GEPInfo.VgprParts.empty())
      return true;
  }
  return false;
}

void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
  const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
  unsigned AS = PtrTy.getAddressSpace();
  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
      STI.ldsRequiresM0Init()) {
    MachineBasicBlock *BB = I.getParent();

    // If DS instructions require M0 initialization, insert it before selecting.
    BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addImm(-1);
  }
}

bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
    MachineInstr &I) const {
  initM0(I);
  return selectImpl(I, *CoverageInfo);
}

static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
  if (Reg.isPhysical())
    return false;

  MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
  const unsigned Opcode = MI.getOpcode();

  if (Opcode == AMDGPU::COPY)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI);

  if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
      Opcode == AMDGPU::G_XOR)
    return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
           isVCmpResult(MI.getOperand(2).getReg(), MRI);

  if (auto *GI = dyn_cast<GIntrinsic>(&MI))
    return GI->is(Intrinsic::amdgcn_class);

  return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
}

bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
  MachineBasicBlock *BB = I.getParent();
  MachineOperand &CondOp = I.getOperand(0);
  Register CondReg = CondOp.getReg();
  const DebugLoc &DL = I.getDebugLoc();

  unsigned BrOpcode;
  Register CondPhysReg;
  const TargetRegisterClass *ConstrainRC;

  // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
  // whether the branch is uniform when selecting the instruction. In
  // GlobalISel, we should push that decision into RegBankSelect. Assume for now
  // RegBankSelect knows what it's doing if the branch condition is scc, even
  // though it currently does not.
  if (!isVCC(CondReg, *MRI)) {
    if (MRI->getType(CondReg) != LLT::scalar(32))
      return false;

    CondPhysReg = AMDGPU::SCC;
    BrOpcode = AMDGPU::S_CBRANCH_SCC1;
    ConstrainRC = &AMDGPU::SReg_32RegClass;
  } else {
    // FIXME: Should scc->vcc copies and with exec?

    // Unless the value of CondReg is a result of a V_CMP* instruction then we
    // need to insert an and with exec.
    if (!isVCmpResult(CondReg, *MRI)) {
      const bool Is64 = STI.isWave64();
      const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
      const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;

      Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
      BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
          .addReg(CondReg)
          .addReg(Exec)
          .setOperandDead(3); // Dead scc
      CondReg = TmpReg;
    }

    CondPhysReg = TRI.getVCC();
    BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
    ConstrainRC = TRI.getBoolRC();
  }

  if (!MRI->getRegClassOrNull(CondReg))
    MRI->setRegClass(CondReg, ConstrainRC);

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
      .addReg(CondReg);
  BuildMI(*BB, &I, DL, TII.get(BrOpcode))
      .addMBB(I.getOperand(1).getMBB());

  I.eraseFromParent();
  return true;
}

bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
    MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
  if (IsVGPR)
    I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));

  return RBI.constrainGenericRegister(
      DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}

bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  Register MaskReg = I.getOperand(2).getReg();
  LLT Ty = MRI->getType(DstReg);
  LLT MaskTy = MRI->getType(MaskReg);
  MachineBasicBlock *BB = I.getParent();
  const DebugLoc &DL = I.getDebugLoc();

  const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
  const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
  const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
  const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
  if (DstRB != SrcRB) // Should only happen for hand written MIR.
    return false;

  // Try to avoid emitting a bit operation when we only need to touch half of
  // the 64-bit pointer.
  APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
  const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
  const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);

  const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
  const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;

  if (!IsVGPR && Ty.getSizeInBits() == 64 &&
      !CanCopyLow32 && !CanCopyHi32) {
    auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
                   .addReg(SrcReg)
                   .addReg(MaskReg)
                   .setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
  }

  unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
  const TargetRegisterClass &RegRC
    = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;

  const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
  const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
  const TargetRegisterClass *MaskRC =
      TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);

  if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
      !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
      !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
    return false;

  if (Ty.getSizeInBits() == 32) {
    assert(MaskTy.getSizeInBits() == 32 &&
           "ptrmask should have been narrowed during legalize");

    auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
                     .addReg(SrcReg)
                     .addReg(MaskReg);

    if (!IsVGPR)
      NewOp.setOperandDead(3); // Dead scc
    I.eraseFromParent();
    return true;
  }

  Register HiReg = MRI->createVirtualRegister(&RegRC);
  Register LoReg = MRI->createVirtualRegister(&RegRC);

  // Extract the subregisters from the source pointer.
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
      .addReg(SrcReg, 0, AMDGPU::sub0);
  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
      .addReg(SrcReg, 0, AMDGPU::sub1);

  Register MaskedLo, MaskedHi;

  if (CanCopyLow32) {
    // If all the bits in the low half are 1, we only need a copy for it.
    MaskedLo = LoReg;
  } else {
    // Extract the mask subregister and apply the and.
    Register MaskLo = MRI->createVirtualRegister(&RegRC);
    MaskedLo = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
        .addReg(MaskReg, 0, AMDGPU::sub0);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
        .addReg(LoReg)
        .addReg(MaskLo);
  }

  if (CanCopyHi32) {
    // If all the bits in the high half are 1, we only need a copy for it.
    MaskedHi = HiReg;
  } else {
    Register MaskHi = MRI->createVirtualRegister(&RegRC);
    MaskedHi = MRI->createVirtualRegister(&RegRC);

    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
        .addReg(MaskReg, 0, AMDGPU::sub1);
    BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
        .addReg(HiReg)
        .addReg(MaskHi);
  }

  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
      .addReg(MaskedLo)
      .addImm(AMDGPU::sub0)
      .addReg(MaskedHi)
      .addImm(AMDGPU::sub1);
  I.eraseFromParent();
  return true;
}

/// Return the register to use for the index value, and the subregister to use
/// for the indirectly accessed register.
static std::pair<Register, unsigned>
computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
                        const TargetRegisterClass *SuperRC, Register IdxReg,
                        unsigned EltSize, GISelKnownBits &KnownBits) {
  Register IdxBaseReg;
  int Offset;

  std::tie(IdxBaseReg, Offset) =
      AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
  if (IdxBaseReg == AMDGPU::NoRegister) {
    // This will happen if the index is a known constant. This should ordinarily
    // be legalized out, but handle it as a register just in case.
    assert(Offset == 0);
    IdxBaseReg = IdxReg;
  }

  ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);

  // Skip out of bounds offsets, or else we would end up using an undefined
  // register.
  if (static_cast<unsigned>(Offset) >= SubRegs.size())
    return std::pair(IdxReg, SubRegs[0]);
  return std::pair(IdxBaseReg, SubRegs[Offset]);
}

3189bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3190MachineInstr &MI) const{
3191Register DstReg =MI.getOperand(0).getReg();
3192Register SrcReg =MI.getOperand(1).getReg();
3193Register IdxReg =MI.getOperand(2).getReg();
3194
3195LLT DstTy =MRI->getType(DstReg);
3196LLT SrcTy =MRI->getType(SrcReg);
3197
3198constRegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3199constRegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3200constRegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3201
3202// The index must be scalar. If it wasn't RegBankSelect should have moved this
3203// into a waterfall loop.
3204if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3205returnfalse;
3206
3207constTargetRegisterClass *SrcRC =
3208TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3209constTargetRegisterClass *DstRC =
3210TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3211if (!SrcRC || !DstRC)
3212returnfalse;
3213if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3214 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3215 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3216returnfalse;
3217
3218MachineBasicBlock *BB =MI.getParent();
3219constDebugLoc &DL =MI.getDebugLoc();
3220constbool Is64 = DstTy.getSizeInBits() == 64;
3221
3222unsignedSubReg;
3223 std::tie(IdxReg,SubReg) =computeIndirectRegIndex(
3224 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3225
3226if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3227if (DstTy.getSizeInBits() != 32 && !Is64)
3228returnfalse;
3229
3230BuildMI(*BB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3231 .addReg(IdxReg);
3232
3233unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3234BuildMI(*BB, &MI,DL, TII.get(Opc), DstReg)
3235 .addReg(SrcReg, 0,SubReg)
3236 .addReg(SrcReg,RegState::Implicit);
3237MI.eraseFromParent();
3238returntrue;
3239 }
3240
3241if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3242returnfalse;
3243
3244if (!STI.useVGPRIndexMode()) {
3245BuildMI(*BB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3246 .addReg(IdxReg);
3247BuildMI(*BB, &MI,DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3248 .addReg(SrcReg, 0,SubReg)
3249 .addReg(SrcReg,RegState::Implicit);
3250MI.eraseFromParent();
3251returntrue;
3252 }
3253
3254constMCInstrDesc &GPRIDXDesc =
3255 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC),true);
3256BuildMI(*BB,MI,DL, GPRIDXDesc, DstReg)
3257 .addReg(SrcReg)
3258 .addReg(IdxReg)
3259 .addImm(SubReg);
3260
3261MI.eraseFromParent();
3262returntrue;
3263}
3264
3265// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3266bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3267MachineInstr &MI) const{
3268Register DstReg =MI.getOperand(0).getReg();
3269Register VecReg =MI.getOperand(1).getReg();
3270Register ValReg =MI.getOperand(2).getReg();
3271Register IdxReg =MI.getOperand(3).getReg();
3272
3273LLT VecTy =MRI->getType(DstReg);
3274LLT ValTy =MRI->getType(ValReg);
3275unsignedVecSize = VecTy.getSizeInBits();
3276unsigned ValSize = ValTy.getSizeInBits();
3277
3278constRegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3279constRegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3280constRegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3281
3282assert(VecTy.getElementType() == ValTy);
3283
3284// The index must be scalar. If it wasn't, RegBankSelect should have moved this
3285// into a waterfall loop.
3286if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3287returnfalse;
3288
3289constTargetRegisterClass *VecRC =
3290TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3291constTargetRegisterClass *ValRC =
3292TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3293
3294if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3295 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3296 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3297 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3298returnfalse;
3299
3300if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3301returnfalse;
3302
3303unsignedSubReg;
3304 std::tie(IdxReg,SubReg) =
3305computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3306
3307constboolIndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3308 STI.useVGPRIndexMode();
3309
3310MachineBasicBlock *BB =MI.getParent();
3311constDebugLoc &DL =MI.getDebugLoc();
3312
3313if (!IndexMode) {
3314BuildMI(*BB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3315 .addReg(IdxReg);
3316
3317constMCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3318 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3319BuildMI(*BB,MI,DL, RegWriteOp, DstReg)
3320 .addReg(VecReg)
3321 .addReg(ValReg)
3322 .addImm(SubReg);
3323MI.eraseFromParent();
3324returntrue;
3325 }
3326
3327constMCInstrDesc &GPRIDXDesc =
3328 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC),false);
3329BuildMI(*BB,MI,DL, GPRIDXDesc, DstReg)
3330 .addReg(VecReg)
3331 .addReg(ValReg)
3332 .addReg(IdxReg)
3333 .addImm(SubReg);
3334
3335MI.eraseFromParent();
3336returntrue;
3337}
3338
3339bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const{
3340assert(!AMDGPU::isGFX12Plus(STI));
3341unsigned Opc;
3342unsignedSize =MI.getOperand(3).getImm();
3343
3344// The struct intrinsic variants add one additional operand over raw.
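// The extra operand is the vindex register at operand index 4; when it is
// present, all later operands (voffset, soffset, imm offset, aux) shift down
// by one position, which OpOffset accounts for below.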
3345constbool HasVIndex =MI.getNumOperands() == 9;
3346Register VIndex;
3347int OpOffset = 0;
3348if (HasVIndex) {
3349 VIndex =MI.getOperand(4).getReg();
3350 OpOffset = 1;
3351 }
3352
3353Register VOffset =MI.getOperand(4 + OpOffset).getReg();
3354 std::optional<ValueAndVReg> MaybeVOffset =
3355getIConstantVRegValWithLookThrough(VOffset, *MRI);
3356constbool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3357
3358switch (Size) {
3359default:
3360returnfalse;
3361case 1:
3362 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3363 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3364 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3365 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3366break;
3367case 2:
3368 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3369 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3370 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3371 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3372break;
3373case 4:
3374 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3375 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3376 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3377 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3378break;
3379case 12:
3380if (!Subtarget->hasLDSLoadB96_B128())
3381returnfalse;
3382
3383 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
3384 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
3385 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
3386 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
3387break;
3388case 16:
3389if (!Subtarget->hasLDSLoadB96_B128())
3390returnfalse;
3391
3392 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
3393 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
3394 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
3395 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
3396break;
3397 }
3398
3399MachineBasicBlock *MBB =MI.getParent();
3400constDebugLoc &DL =MI.getDebugLoc();
3401BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3402 .add(MI.getOperand(2));
3403
3404auto MIB =BuildMI(*MBB, &MI,DL, TII.get(Opc));
3405
3406if (HasVIndex && HasVOffset) {
3407Register IdxReg =MRI->createVirtualRegister(TRI.getVGPR64Class());
3408BuildMI(*MBB, &*MIB,DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3409 .addReg(VIndex)
3410 .addImm(AMDGPU::sub0)
3411 .addReg(VOffset)
3412 .addImm(AMDGPU::sub1);
3413
3414 MIB.addReg(IdxReg);
3415 }elseif (HasVIndex) {
3416 MIB.addReg(VIndex);
3417 }elseif (HasVOffset) {
3418 MIB.addReg(VOffset);
3419 }
3420
3421 MIB.add(MI.getOperand(1));// rsrc
3422 MIB.add(MI.getOperand(5 + OpOffset));// soffset
3423 MIB.add(MI.getOperand(6 + OpOffset));// imm offset
3424bool IsGFX12Plus =AMDGPU::isGFX12Plus(STI);
3425unsigned Aux =MI.getOperand(7 + OpOffset).getImm();
3426 MIB.addImm(Aux & (IsGFX12Plus ?AMDGPU::CPol::ALL
3427 :AMDGPU::CPol::ALL_pregfx12));// cpol
3428 MIB.addImm(
3429 Aux & (IsGFX12Plus ?AMDGPU::CPol::SWZ :AMDGPU::CPol::SWZ_pregfx12)
3430 ? 1
3431 : 0);// swz
3432
3433MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3434MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3435 LoadPtrI.Offset =MI.getOperand(6 + OpOffset).getImm();
3436MachinePointerInfo StorePtrI = LoadPtrI;
3437 StorePtrI.V =nullptr;
3438 StorePtrI.AddrSpace =AMDGPUAS::LOCAL_ADDRESS;
3439
3440autoF = LoadMMO->getFlags() &
3441 ~(MachineMemOperand::MOStore |MachineMemOperand::MOLoad);
3442 LoadMMO =MF->getMachineMemOperand(LoadPtrI,F |MachineMemOperand::MOLoad,
3443Size, LoadMMO->getBaseAlign());
3444
3445MachineMemOperand *StoreMMO =
3446MF->getMachineMemOperand(StorePtrI,F |MachineMemOperand::MOStore,
3447sizeof(int32_t), LoadMMO->getBaseAlign());
3448
3449 MIB.setMemRefs({LoadMMO, StoreMMO});
3450
3451MI.eraseFromParent();
3452returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3453}
3454
3455/// Match a zero extend from a 32-bit value to 64 bits.
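///
/// Both of the following forms (illustrative MIR) return %x; anything else
/// returns an invalid Register():
///   %z:_(s64) = G_ZEXT %x:_(s32)
///   %z:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)   ; %zero is constant 0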
3456staticRegistermatchZeroExtendFromS32(MachineRegisterInfo &MRI,Register Reg) {
3457Register ZExtSrc;
3458if (mi_match(Reg,MRI,m_GZExt(m_Reg(ZExtSrc))))
3459returnMRI.getType(ZExtSrc) ==LLT::scalar(32) ? ZExtSrc :Register();
3460
3461// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3462constMachineInstr *Def =getDefIgnoringCopies(Reg,MRI);
3463if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3464returnRegister();
3465
3466assert(Def->getNumOperands() == 3 &&
3467MRI.getType(Def->getOperand(0).getReg()) ==LLT::scalar(64));
3468if (mi_match(Def->getOperand(2).getReg(),MRI,m_ZeroInt())) {
3469return Def->getOperand(1).getReg();
3470 }
3471
3472returnRegister();
3473}
3474
3475bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3476unsigned Opc;
3477unsignedSize =MI.getOperand(3).getImm();
3478
3479switch (Size) {
3480default:
3481returnfalse;
3482case 1:
3483 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3484break;
3485case 2:
3486 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3487break;
3488case 4:
3489 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3490break;
3491case 12:
3492if (!Subtarget->hasLDSLoadB96_B128())
3493returnfalse;
3494 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
3495break;
3496case 16:
3497if (!Subtarget->hasLDSLoadB96_B128())
3498returnfalse;
3499 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
3500break;
3501 }
3502
3503MachineBasicBlock *MBB =MI.getParent();
3504constDebugLoc &DL =MI.getDebugLoc();
3505BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3506 .add(MI.getOperand(2));
3507
3508RegisterAddr =MI.getOperand(1).getReg();
3509Register VOffset;
3510// Try to split SAddr and VOffset. Global and LDS pointers share the same
3511// immediate offset, so we cannot use a regular SelectGlobalSAddr().
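// For instance, if Addr is a G_PTR_ADD of an SGPR base and a zero-extended
// 32-bit value, the base becomes the SADDR operand and the 32-bit value the
// VOFFSET operand; if no VOFFSET is found, a zero VGPR is materialized below.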
3512if (!isSGPR(Addr)) {
3513auto AddrDef =getDefSrcRegIgnoringCopies(Addr, *MRI);
3514if (isSGPR(AddrDef->Reg)) {
3515Addr = AddrDef->Reg;
3516 }elseif (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3517Register SAddr =
3518getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3519if (isSGPR(SAddr)) {
3520Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3521if (Register Off =matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3522Addr = SAddr;
3523 VOffset =Off;
3524 }
3525 }
3526 }
3527 }
3528
3529if (isSGPR(Addr)) {
3530 Opc =AMDGPU::getGlobalSaddrOp(Opc);
3531if (!VOffset) {
3532 VOffset =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3533BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3534 .addImm(0);
3535 }
3536 }
3537
3538auto MIB =BuildMI(*MBB, &MI,DL, TII.get(Opc))
3539 .addReg(Addr);
3540
3541if (isSGPR(Addr))
3542 MIB.addReg(VOffset);
3543
3544 MIB.add(MI.getOperand(4))// offset
3545 .add(MI.getOperand(5));// cpol
3546
3547MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3548MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3549 LoadPtrI.Offset =MI.getOperand(4).getImm();
3550MachinePointerInfo StorePtrI = LoadPtrI;
3551 LoadPtrI.AddrSpace =AMDGPUAS::GLOBAL_ADDRESS;
3552 StorePtrI.AddrSpace =AMDGPUAS::LOCAL_ADDRESS;
3553autoF = LoadMMO->getFlags() &
3554 ~(MachineMemOperand::MOStore |MachineMemOperand::MOLoad);
3555 LoadMMO =MF->getMachineMemOperand(LoadPtrI,F |MachineMemOperand::MOLoad,
3556Size, LoadMMO->getBaseAlign());
3557MachineMemOperand *StoreMMO =
3558MF->getMachineMemOperand(StorePtrI,F |MachineMemOperand::MOStore,
3559sizeof(int32_t),Align(4));
3560
3561 MIB.setMemRefs({LoadMMO, StoreMMO});
3562
3563MI.eraseFromParent();
3564returnconstrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3565}
3566
3567bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3568MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3569MI.removeOperand(1);
3570MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3571returntrue;
3572}
3573
3574// FIXME: This should be removed and the patterns allowed to select. We just
3575// need the AGPR/VGPR combination versions.
3576bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const{
3577unsigned Opc;
3578switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3579case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3580 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3581break;
3582case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3583 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3584break;
3585case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3586 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3587break;
3588case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3589 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3590break;
3591case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3592 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3593break;
3594case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3595 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3596break;
3597case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3598 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3599break;
3600case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3601 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3602break;
3603case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3604 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3605break;
3606case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3607 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3608break;
3609case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3610 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3611break;
3612case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3613 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3614break;
3615case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3616 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3617break;
3618case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3619 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3620break;
3621case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
3622 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
3623break;
3624case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
3625 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
3626break;
3627case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
3628 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
3629break;
3630case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
3631 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
3632break;
3633case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
3634 Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
3635break;
3636case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
3637 Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
3638break;
3639case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
3640 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
3641break;
3642case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
3643 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
3644break;
3645case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
3646 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
3647break;
3648case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
3649 Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
3650break;
3651case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
3652 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
3653break;
3654case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
3655 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
3656break;
3657case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
3658 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
3659break;
3660case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
3661 Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;
3662break;
3663default:
3664llvm_unreachable("unhandled smfmac intrinsic");
3665 }
3666
3667auto VDst_In =MI.getOperand(4);
3668
3669MI.setDesc(TII.get(Opc));
3670MI.removeOperand(4);// VDst_In
3671MI.removeOperand(1);// Intrinsic ID
3672MI.addOperand(VDst_In);// Re-add VDst_In to the end
3673MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3674returntrue;
3675}
3676
3677bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
3678MachineInstr &MI,Intrinsic::ID IntrID) const{
3679if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
3680 !Subtarget->hasPermlane16Swap())
3681returnfalse;
3682if (IntrID == Intrinsic::amdgcn_permlane32_swap &&
3683 !Subtarget->hasPermlane32Swap())
3684returnfalse;
3685
3686unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
3687 ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
3688 : AMDGPU::V_PERMLANE32_SWAP_B32_e64;
3689
3690MI.removeOperand(2);
3691MI.setDesc(TII.get(Opcode));
3692MI.addOperand(*MF,MachineOperand::CreateReg(AMDGPU::EXEC,false,true));
3693
3694MachineOperand &FI =MI.getOperand(4);
3695 FI.setImm(FI.getImm() ?AMDGPU::DPP::DPP_FI_1 :AMDGPU::DPP::DPP_FI_0);
3696
3697returnconstrainSelectedInstRegOperands(MI, TII, TRI, RBI);
3698}
3699
3700bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const{
3701Register DstReg =MI.getOperand(0).getReg();
3702Register SrcReg =MI.getOperand(1).getReg();
3703constRegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3704constbool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3705MachineBasicBlock *MBB =MI.getParent();
3706constDebugLoc &DL =MI.getDebugLoc();
3707
3708if (IsVALU) {
3709BuildMI(*MBB,MI,DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3710 .addImm(Subtarget->getWavefrontSizeLog2())
3711 .addReg(SrcReg);
3712 }else {
3713BuildMI(*MBB,MI,DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3714 .addReg(SrcReg)
3715 .addImm(Subtarget->getWavefrontSizeLog2())
3716 .setOperandDead(3);// Dead scc
3717 }
3718
3719constTargetRegisterClass &RC =
3720 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3721if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3722returnfalse;
3723
3724MI.eraseFromParent();
3725returntrue;
3726}
3727
3728// Match a BITOP3 operation and return the number of matched instructions plus
3729// the truth table.
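// Each distinct leaf operand gets one of the masks 0xf0, 0xcc or 0xaa
// (constants become 0x00 or 0xff), and the masks are combined with the same
// and/or/xor structure as the matched expression. As a sketch, with the
// leaves in Src order s0, s1, s2, the expression s0 & s1 & s2 produces the
// truth table 0xf0 & 0xcc & 0xaa = 0x80.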
3730static std::pair<unsigned, uint8_t>BitOp3_Op(Register R,
3731SmallVectorImpl<Register> &Src,
3732constMachineRegisterInfo &MRI) {
3733unsigned NumOpcodes = 0;
3734uint8_t LHSBits, RHSBits;
3735
3736auto getOperandBits = [&Src, R, &MRI](RegisterOp,uint8_t &Bits) ->bool {
3737// Define truth table given Src0, Src1, Src2 bits permutations:
3738// 0 0 0
3739// 0 0 1
3740// 0 1 0
3741// 0 1 1
3742// 1 0 0
3743// 1 0 1
3744// 1 1 0
3745// 1 1 1
3746constuint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };
3747
3748if (mi_match(Op,MRI,m_AllOnesInt())) {
3749 Bits = 0xff;
3750returntrue;
3751 }
3752if (mi_match(Op,MRI,m_ZeroInt())) {
3753 Bits = 0;
3754returntrue;
3755 }
3756
3757for (unsignedI = 0;I < Src.size(); ++I) {
3758// Try to find existing reused operand
3759if (Src[I] ==Op) {
3760 Bits = SrcBits[I];
3761returntrue;
3762 }
3763// Try to replace parent operator
3764if (Src[I] == R) {
3765 Bits = SrcBits[I];
3766 Src[I] =Op;
3767returntrue;
3768 }
3769 }
3770
3771if (Src.size() == 3) {
3772// No room left for operands. Try one last time; there can be a 'not' of
3773// one of our source operands. In this case we can compute the bits
3774// without growing the Src vector.
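// E.g. if some Src[I] already holds 'b', an operand '~b' can still be
// encoded as ~SrcBits[I] (e.g. ~0xcc = 0x33) without using a fourth slot.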
3775RegisterLHS;
3776if (mi_match(Op,MRI,m_Not(m_Reg(LHS)))) {
3777LHS =getSrcRegIgnoringCopies(LHS,MRI);
3778for (unsignedI = 0;I < Src.size(); ++I) {
3779if (Src[I] ==LHS) {
3780 Bits = ~SrcBits[I];
3781returntrue;
3782 }
3783 }
3784 }
3785
3786returnfalse;
3787 }
3788
3789 Bits = SrcBits[Src.size()];
3790 Src.push_back(Op);
3791returntrue;
3792 };
3793
3794MachineInstr *MI =MRI.getVRegDef(R);
3795switch (MI->getOpcode()) {
3796case TargetOpcode::G_AND:
3797case TargetOpcode::G_OR:
3798case TargetOpcode::G_XOR: {
3799RegisterLHS =getSrcRegIgnoringCopies(MI->getOperand(1).getReg(),MRI);
3800RegisterRHS =getSrcRegIgnoringCopies(MI->getOperand(2).getReg(),MRI);
3801
3802SmallVector<Register, 3> Backup(Src.begin(), Src.end());
3803if (!getOperandBits(LHS, LHSBits) ||
3804 !getOperandBits(RHS, RHSBits)) {
3805 Src = Backup;
3806return std::make_pair(0, 0);
3807 }
3808
3809// Recursion is naturally limited by the size of the operand vector.
3810autoOp =BitOp3_Op(LHS, Src,MRI);
3811if (Op.first) {
3812 NumOpcodes +=Op.first;
3813 LHSBits =Op.second;
3814 }
3815
3816Op =BitOp3_Op(RHS, Src,MRI);
3817if (Op.first) {
3818 NumOpcodes +=Op.first;
3819 RHSBits =Op.second;
3820 }
3821break;
3822 }
3823default:
3824return std::make_pair(0, 0);
3825 }
3826
3827uint8_t TTbl;
3828switch (MI->getOpcode()) {
3829case TargetOpcode::G_AND:
3830 TTbl = LHSBits & RHSBits;
3831break;
3832case TargetOpcode::G_OR:
3833 TTbl = LHSBits | RHSBits;
3834break;
3835case TargetOpcode::G_XOR:
3836 TTbl = LHSBits ^ RHSBits;
3837break;
3838default:
3839break;
3840 }
3841
3842return std::make_pair(NumOpcodes + 1, TTbl);
3843}
3844
3845bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const{
3846if (!Subtarget->hasBitOp3Insts())
3847returnfalse;
3848
3849Register DstReg =MI.getOperand(0).getReg();
3850constRegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3851constbool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3852if (!IsVALU)
3853returnfalse;
3854
3855SmallVector<Register, 3> Src;
3856uint8_t TTbl;
3857unsigned NumOpcodes;
3858
3859 std::tie(NumOpcodes, TTbl) =BitOp3_Op(DstReg, Src, *MRI);
3860
3861// The Src.empty() case can happen if all operands are all-zero or all-ones
3862// constants. Normally this is optimized out before reaching this point.
3863if (NumOpcodes < 2 || Src.empty())
3864returnfalse;
3865
3866constbool IsB32 =MRI->getType(DstReg) ==LLT::scalar(32);
3867if (NumOpcodes == 2 && IsB32) {
3868// Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
3869// the asm more readable. This cannot be modeled with AddedComplexity because
3870// the selector does not know how many operations were matched.
3871if (mi_match(MI, *MRI,m_GXor(m_GXor(m_Reg(),m_Reg()),m_Reg())) ||
3872mi_match(MI, *MRI,m_GOr(m_GOr(m_Reg(),m_Reg()),m_Reg())) ||
3873mi_match(MI, *MRI,m_GOr(m_GAnd(m_Reg(),m_Reg()),m_Reg())))
3874returnfalse;
3875 }elseif (NumOpcodes < 4) {
3876// For a uniform case the threshold should be higher to account for moves
3877// between VGPRs and SGPRs. It needs one operand in a VGPR, the other two can
3878// be in SGPRs, and a readfirstlane afterwards.
3879returnfalse;
3880 }
3881
3882unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;
3883unsigned CBL = STI.getConstantBusLimit(Opc);
3884MachineBasicBlock *MBB =MI.getParent();
3885constDebugLoc &DL =MI.getDebugLoc();
3886
3887for (unsignedI = 0;I < Src.size(); ++I) {
3888constRegisterBank *RB = RBI.getRegBank(Src[I], *MRI, TRI);
3889if (RB->getID() != AMDGPU::SGPRRegBankID)
3890continue;
3891if (CBL > 0) {
3892 --CBL;
3893continue;
3894 }
3895Register NewReg =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3896BuildMI(*MBB,MI,DL, TII.get(AMDGPU::COPY), NewReg)
3897 .addReg(Src[I]);
3898 Src[I] = NewReg;
3899 }
3900
3901// The last operand can be ignored, turning a ternary operation into a binary
3902// one. For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
3903// 'c' with 'a' here without changing the answer. In some pathological cases
3904// it may even be possible to end up with an operation that has a single
3905// operand, if the optimizer did not catch it earlier.
3906while (Src.size() < 3)
3907 Src.push_back(Src[0]);
3908
3909auto MIB =BuildMI(*MBB,MI,DL, TII.get(Opc), DstReg);
3910if (!IsB32)
3911 MIB.addImm(0);// src_mod0
3912 MIB.addReg(Src[0]);
3913if (!IsB32)
3914 MIB.addImm(0);// src_mod1
3915 MIB.addReg(Src[1]);
3916if (!IsB32)
3917 MIB.addImm(0);// src_mod2
3918 MIB.addReg(Src[2])
3919 .addImm(TTbl);
3920if (!IsB32)
3921 MIB.addImm(0);// op_sel
3922
3923constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3924MI.eraseFromParent();
3925
3926returntrue;
3927}
3928
3929bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const{
3930Register SrcReg =MI.getOperand(0).getReg();
3931if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3932returnfalse;
3933
3934MachineInstr *DefMI =MRI->getVRegDef(SrcReg);
3935Register SP =
3936 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3937Register WaveAddr =getWaveAddress(DefMI);
3938MachineBasicBlock *MBB =MI.getParent();
3939constDebugLoc &DL =MI.getDebugLoc();
3940
3941if (!WaveAddr) {
3942 WaveAddr =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3943BuildMI(*MBB,MI,DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3944 .addReg(SrcReg)
3945 .addImm(Subtarget->getWavefrontSizeLog2())
3946 .setOperandDead(3);// Dead scc
3947 }
3948
3949BuildMI(*MBB, &MI,DL, TII.get(AMDGPU::COPY), SP)
3950 .addReg(WaveAddr);
3951
3952MI.eraseFromParent();
3953returntrue;
3954}
3955
3956boolAMDGPUInstructionSelector::select(MachineInstr &I) {
3957
3958if (!I.isPreISelOpcode()) {
3959if (I.isCopy())
3960return selectCOPY(I);
3961returntrue;
3962 }
3963
3964switch (I.getOpcode()) {
3965case TargetOpcode::G_AND:
3966case TargetOpcode::G_OR:
3967case TargetOpcode::G_XOR:
3968if (selectBITOP3(I))
3969returntrue;
3970if (selectImpl(I, *CoverageInfo))
3971returntrue;
3972return selectG_AND_OR_XOR(I);
3973case TargetOpcode::G_ADD:
3974case TargetOpcode::G_SUB:
3975case TargetOpcode::G_PTR_ADD:
3976if (selectImpl(I, *CoverageInfo))
3977returntrue;
3978return selectG_ADD_SUB(I);
3979case TargetOpcode::G_UADDO:
3980case TargetOpcode::G_USUBO:
3981case TargetOpcode::G_UADDE:
3982case TargetOpcode::G_USUBE:
3983return selectG_UADDO_USUBO_UADDE_USUBE(I);
3984case AMDGPU::G_AMDGPU_MAD_U64_U32:
3985case AMDGPU::G_AMDGPU_MAD_I64_I32:
3986return selectG_AMDGPU_MAD_64_32(I);
3987case TargetOpcode::G_INTTOPTR:
3988case TargetOpcode::G_BITCAST:
3989case TargetOpcode::G_PTRTOINT:
3990case TargetOpcode::G_FREEZE:
3991return selectCOPY(I);
3992case TargetOpcode::G_FNEG:
3993if (selectImpl(I, *CoverageInfo))
3994returntrue;
3995return selectG_FNEG(I);
3996case TargetOpcode::G_FABS:
3997if (selectImpl(I, *CoverageInfo))
3998returntrue;
3999return selectG_FABS(I);
4000case TargetOpcode::G_EXTRACT:
4001return selectG_EXTRACT(I);
4002case TargetOpcode::G_MERGE_VALUES:
4003case TargetOpcode::G_CONCAT_VECTORS:
4004return selectG_MERGE_VALUES(I);
4005case TargetOpcode::G_UNMERGE_VALUES:
4006return selectG_UNMERGE_VALUES(I);
4007case TargetOpcode::G_BUILD_VECTOR:
4008case TargetOpcode::G_BUILD_VECTOR_TRUNC:
4009return selectG_BUILD_VECTOR(I);
4010case TargetOpcode::G_IMPLICIT_DEF:
4011return selectG_IMPLICIT_DEF(I);
4012case TargetOpcode::G_INSERT:
4013return selectG_INSERT(I);
4014case TargetOpcode::G_INTRINSIC:
4015case TargetOpcode::G_INTRINSIC_CONVERGENT:
4016return selectG_INTRINSIC(I);
4017case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
4018case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
4019return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
4020case TargetOpcode::G_ICMP:
4021case TargetOpcode::G_FCMP:
4022if (selectG_ICMP_or_FCMP(I))
4023returntrue;
4024return selectImpl(I, *CoverageInfo);
4025case TargetOpcode::G_LOAD:
4026case TargetOpcode::G_ZEXTLOAD:
4027case TargetOpcode::G_SEXTLOAD:
4028case TargetOpcode::G_STORE:
4029case TargetOpcode::G_ATOMIC_CMPXCHG:
4030case TargetOpcode::G_ATOMICRMW_XCHG:
4031case TargetOpcode::G_ATOMICRMW_ADD:
4032case TargetOpcode::G_ATOMICRMW_SUB:
4033case TargetOpcode::G_ATOMICRMW_AND:
4034case TargetOpcode::G_ATOMICRMW_OR:
4035case TargetOpcode::G_ATOMICRMW_XOR:
4036case TargetOpcode::G_ATOMICRMW_MIN:
4037case TargetOpcode::G_ATOMICRMW_MAX:
4038case TargetOpcode::G_ATOMICRMW_UMIN:
4039case TargetOpcode::G_ATOMICRMW_UMAX:
4040case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
4041case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
4042case TargetOpcode::G_ATOMICRMW_FADD:
4043case TargetOpcode::G_ATOMICRMW_FMIN:
4044case TargetOpcode::G_ATOMICRMW_FMAX:
4045return selectG_LOAD_STORE_ATOMICRMW(I);
4046case TargetOpcode::G_SELECT:
4047return selectG_SELECT(I);
4048case TargetOpcode::G_TRUNC:
4049return selectG_TRUNC(I);
4050case TargetOpcode::G_SEXT:
4051case TargetOpcode::G_ZEXT:
4052case TargetOpcode::G_ANYEXT:
4053case TargetOpcode::G_SEXT_INREG:
4054// This is a workaround. For an extension from type i1, `selectImpl()` uses
4055// patterns from the TD file and generates an illegal VGPR-to-SGPR COPY, as
4056// type i1 can only be held in an SGPR class.
4057if (MRI->getType(I.getOperand(1).getReg()) !=LLT::scalar(1) &&
4058 selectImpl(I, *CoverageInfo))
4059returntrue;
4060return selectG_SZA_EXT(I);
4061case TargetOpcode::G_FPEXT:
4062if (selectG_FPEXT(I))
4063returntrue;
4064return selectImpl(I, *CoverageInfo);
4065case TargetOpcode::G_BRCOND:
4066return selectG_BRCOND(I);
4067case TargetOpcode::G_GLOBAL_VALUE:
4068return selectG_GLOBAL_VALUE(I);
4069case TargetOpcode::G_PTRMASK:
4070return selectG_PTRMASK(I);
4071case TargetOpcode::G_EXTRACT_VECTOR_ELT:
4072return selectG_EXTRACT_VECTOR_ELT(I);
4073case TargetOpcode::G_INSERT_VECTOR_ELT:
4074return selectG_INSERT_VECTOR_ELT(I);
4075case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
4076case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
4077case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
4078case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
4079case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
4080constAMDGPU::ImageDimIntrinsicInfo *Intr =
4081AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
4082assert(Intr &&"not an image intrinsic with image pseudo");
4083return selectImageIntrinsic(I,Intr);
4084 }
4085case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
4086return selectBVHIntrinsic(I);
4087case AMDGPU::G_SBFX:
4088case AMDGPU::G_UBFX:
4089return selectG_SBFX_UBFX(I);
4090case AMDGPU::G_SI_CALL:
4091I.setDesc(TII.get(AMDGPU::SI_CALL));
4092returntrue;
4093case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
4094return selectWaveAddress(I);
4095case AMDGPU::G_STACKRESTORE:
4096return selectStackRestore(I);
4097case AMDGPU::G_PHI:
4098return selectPHI(I);
4099case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
4100return selectCOPY_SCC_VCC(I);
4101case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
4102return selectCOPY_VCC_SCC(I);
4103case AMDGPU::G_AMDGPU_READANYLANE:
4104return selectReadAnyLane(I);
4105case TargetOpcode::G_CONSTANT:
4106case TargetOpcode::G_FCONSTANT:
4107default:
4108return selectImpl(I, *CoverageInfo);
4109 }
4110returnfalse;
4111}
4112
4113InstructionSelector::ComplexRendererFns
4114AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const{
4115return {{
4116 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4117 }};
4118
4119}
4120
4121std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
4122Register Src,bool IsCanonicalizing,bool AllowAbs,bool OpSel) const{
4123unsigned Mods = 0;
4124MachineInstr *MI =getDefIgnoringCopies(Src, *MRI);
4125
4126if (MI->getOpcode() == AMDGPU::G_FNEG) {
4127 Src =MI->getOperand(1).getReg();
4128 Mods |=SISrcMods::NEG;
4129MI =getDefIgnoringCopies(Src, *MRI);
4130 }elseif (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
4131// Fold fsub [+-]0 into fneg. This may not have been folded depending on the
4132// denormal mode, but we're implicitly canonicalizing in a source operand.
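// In other words, (G_FSUB 0.0, %x) is treated here like (G_FNEG %x): the NEG
// source modifier is set and %x becomes the source.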
4133constConstantFP *LHS =
4134getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
4135if (LHS &&LHS->isZero()) {
4136 Mods |=SISrcMods::NEG;
4137 Src =MI->getOperand(2).getReg();
4138 }
4139 }
4140
4141if (AllowAbs &&MI->getOpcode() == AMDGPU::G_FABS) {
4142 Src =MI->getOperand(1).getReg();
4143 Mods |=SISrcMods::ABS;
4144 }
4145
4146if (OpSel)
4147 Mods |=SISrcMods::OP_SEL_0;
4148
4149return std::pair(Src, Mods);
4150}
4151
4152Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
4153Register Src,unsigned Mods,MachineOperand Root,MachineInstr *InsertPt,
4154bool ForceVGPR) const{
4155if ((Mods != 0 || ForceVGPR) &&
4156 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
4157
4158// If we looked through copies to find source modifiers on an SGPR operand,
4159// we now have an SGPR register source. To avoid potentially violating the
4160// constant bus restriction, we need to insert a copy to a VGPR.
4161Register VGPRSrc =MRI->cloneVirtualRegister(Root.getReg());
4162BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
4163 TII.get(AMDGPU::COPY), VGPRSrc)
4164 .addReg(Src);
4165 Src = VGPRSrc;
4166 }
4167
4168return Src;
4169}
4170
4171///
4172/// This will select either an SGPR or VGPR operand and will save us from
4173/// having to write an extra tablegen pattern.
4174InstructionSelector::ComplexRendererFns
4175AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const{
4176return {{
4177 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
4178 }};
4179}
4180
4181InstructionSelector::ComplexRendererFns
4182AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const{
4183Register Src;
4184unsigned Mods;
4185 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4186
4187return {{
4188 [=](MachineInstrBuilder &MIB) {
4189 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4190 },
4191 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); },// src0_mods
4192 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },// clamp
4193 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }// omod
4194 }};
4195}
4196
4197InstructionSelector::ComplexRendererFns
4198AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const{
4199Register Src;
4200unsigned Mods;
4201 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4202/*IsCanonicalizing=*/true,
4203/*AllowAbs=*/false);
4204
4205return {{
4206 [=](MachineInstrBuilder &MIB) {
4207 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4208 },
4209 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); },// src0_mods
4210 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },// clamp
4211 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }// omod
4212 }};
4213}
4214
4215InstructionSelector::ComplexRendererFns
4216AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const{
4217return {{
4218 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
4219 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },// clamp
4220 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }// omod
4221 }};
4222}
4223
4224InstructionSelector::ComplexRendererFns
4225AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const{
4226Register Src;
4227unsigned Mods;
4228 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4229
4230return {{
4231 [=](MachineInstrBuilder &MIB) {
4232 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4233 },
4234 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4235 }};
4236}
4237
4238InstructionSelector::ComplexRendererFns
4239AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
4240MachineOperand &Root) const{
4241Register Src;
4242unsigned Mods;
4243 std::tie(Src, Mods) =
4244 selectVOP3ModsImpl(Root.getReg(),/*IsCanonicalizing=*/false);
4245
4246return {{
4247 [=](MachineInstrBuilder &MIB) {
4248 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4249 },
4250 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4251 }};
4252}
4253
4254InstructionSelector::ComplexRendererFns
4255AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const{
4256Register Src;
4257unsigned Mods;
4258 std::tie(Src, Mods) =
4259 selectVOP3ModsImpl(Root.getReg(),/*IsCanonicalizing=*/true,
4260/*AllowAbs=*/false);
4261
4262return {{
4263 [=](MachineInstrBuilder &MIB) {
4264 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
4265 },
4266 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4267 }};
4268}
4269
4270InstructionSelector::ComplexRendererFns
4271AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const{
4272RegisterReg = Root.getReg();
4273constMachineInstr *Def =getDefIgnoringCopies(Reg, *MRI);
4274if (Def->getOpcode() == AMDGPU::G_FNEG ||Def->getOpcode() == AMDGPU::G_FABS)
4275return {};
4276return {{
4277 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4278 }};
4279}
4280
4281std::pair<Register, unsigned>
4282AMDGPUInstructionSelector::selectVOP3PModsImpl(
4283Register Src,constMachineRegisterInfo &MRI,bool IsDOT) const{
4284unsigned Mods = 0;
4285MachineInstr *MI =MRI.getVRegDef(Src);
4286
4287if (MI->getOpcode() == AMDGPU::G_FNEG &&
4288// It's possible to see an f32 fneg here, but unlikely.
4289// TODO: Treat f32 fneg as only high bit.
4290MRI.getType(Src) ==LLT::fixed_vector(2, 16)) {
4291 Mods ^= (SISrcMods::NEG |SISrcMods::NEG_HI);
4292 Src =MI->getOperand(1).getReg();
4293MI =MRI.getVRegDef(Src);
4294 }
4295
4296// TODO: Handle G_FSUB 0 as fneg
4297
4298// TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
4299 (void)IsDOT;// DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
4300
4301// Packed instructions do not have abs modifiers.
4302 Mods |=SISrcMods::OP_SEL_1;
4303
4304return std::pair(Src, Mods);
4305}
4306
4307InstructionSelector::ComplexRendererFns
4308AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const{
4309MachineRegisterInfo &MRI
4310 = Root.getParent()->getParent()->getParent()->getRegInfo();
4311
4312Register Src;
4313unsigned Mods;
4314 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
4315
4316return {{
4317 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4318 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4319 }};
4320}
4321
4322InstructionSelector::ComplexRendererFns
4323AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const{
4324MachineRegisterInfo &MRI
4325 = Root.getParent()->getParent()->getParent()->getRegInfo();
4326
4327Register Src;
4328unsigned Mods;
4329 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI,true);
4330
4331return {{
4332 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4333 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4334 }};
4335}
4336
4337InstructionSelector::ComplexRendererFns
4338AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const{
4339// A literal i1 value set in the intrinsic represents SrcMods for the next
4340// operand. The value is in the Imm operand as an i1 sign extended to int64_t.
4341// 1 (i.e. -1) promotes packed values to signed, 0 treats them as unsigned.
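// So an immediate of -1 yields Mods = OP_SEL_1 | NEG, while 0 yields just
// OP_SEL_1.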
4342assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4343"expected i1 value");
4344unsigned Mods =SISrcMods::OP_SEL_1;
4345if (Root.getImm() == -1)
4346 Mods ^=SISrcMods::NEG;
4347return {{
4348 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4349 }};
4350}
4351
4352InstructionSelector::ComplexRendererFns
4353AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
4354MachineOperand &Root) const{
4355assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
4356"expected i1 value");
4357unsigned Mods =SISrcMods::OP_SEL_1;
4358if (Root.getImm() != 0)
4359 Mods |=SISrcMods::OP_SEL_0;
4360
4361return {{
4362 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4363 }};
4364}
4365
4366staticRegisterbuildRegSequence(SmallVectorImpl<Register> &Elts,
4367MachineInstr *InsertPt,
4368MachineRegisterInfo &MRI) {
4369constTargetRegisterClass *DstRegClass;
4370switch (Elts.size()) {
4371case 8:
4372 DstRegClass = &AMDGPU::VReg_256RegClass;
4373break;
4374case 4:
4375 DstRegClass = &AMDGPU::VReg_128RegClass;
4376break;
4377case 2:
4378 DstRegClass = &AMDGPU::VReg_64RegClass;
4379break;
4380default:
4381llvm_unreachable("unhandled Reg sequence size");
4382 }
4383
4384MachineIRBuilderB(*InsertPt);
4385auto MIB =B.buildInstr(AMDGPU::REG_SEQUENCE)
4386 .addDef(MRI.createVirtualRegister(DstRegClass));
4387for (unsigned i = 0; i < Elts.size(); ++i) {
4388 MIB.addReg(Elts[i]);
4389 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
4390 }
4391return MIB->getOperand(0).getReg();
4392}
4393
4394staticvoidselectWMMAModsNegAbs(unsigned ModOpcode,unsigned &Mods,
4395SmallVectorImpl<Register> &Elts,Register &Src,
4396MachineInstr *InsertPt,
4397MachineRegisterInfo &MRI) {
4398if (ModOpcode == TargetOpcode::G_FNEG) {
4399 Mods |=SISrcMods::NEG;
4400// Check if all elements also have abs modifier
4401SmallVector<Register, 8> NegAbsElts;
4402for (auto El : Elts) {
4403Register FabsSrc;
4404if (!mi_match(El,MRI,m_GFabs(m_Reg(FabsSrc))))
4405break;
4406 NegAbsElts.push_back(FabsSrc);
4407 }
4408if (Elts.size() != NegAbsElts.size()) {
4409// Neg
4410 Src =buildRegSequence(Elts, InsertPt,MRI);
4411 }else {
4412// Neg and Abs
4413 Mods |=SISrcMods::NEG_HI;
4414 Src =buildRegSequence(NegAbsElts, InsertPt,MRI);
4415 }
4416 }else {
4417assert(ModOpcode == TargetOpcode::G_FABS);
4418// Abs
4419 Mods |=SISrcMods::NEG_HI;
4420 Src =buildRegSequence(Elts, InsertPt,MRI);
4421 }
4422}
4423
4424InstructionSelector::ComplexRendererFns
4425AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const{
4426Register Src = Root.getReg();
4427unsigned Mods =SISrcMods::OP_SEL_1;
4428SmallVector<Register, 8> EltsF32;
4429
4430if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
4431assert(BV->getNumSources() > 0);
4432// Based on the first element, decide which mod we match: neg or abs.
4433MachineInstr *ElF32 =MRI->getVRegDef(BV->getSourceReg(0));
4434unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
4435 ? AMDGPU::G_FNEG
4436 : AMDGPU::G_FABS;
4437for (unsigned i = 0; i < BV->getNumSources(); ++i) {
4438 ElF32 =MRI->getVRegDef(BV->getSourceReg(i));
4439if (ElF32->getOpcode() != ModOpcode)
4440break;
4441 EltsF32.push_back(ElF32->getOperand(1).getReg());
4442 }
4443
4444// All elements had ModOpcode modifier
4445if (BV->getNumSources() == EltsF32.size()) {
4446selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4447 *MRI);
4448 }
4449 }
4450
4451return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4452 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4453}
4454
4455InstructionSelector::ComplexRendererFns
4456AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const{
4457Register Src = Root.getReg();
4458unsigned Mods =SISrcMods::OP_SEL_1;
4459SmallVector<Register, 8> EltsV2F16;
4460
4461if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4462for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4463Register FNegSrc;
4464if (!mi_match(CV->getSourceReg(i), *MRI,m_GFNeg(m_Reg(FNegSrc))))
4465break;
4466 EltsV2F16.push_back(FNegSrc);
4467 }
4468
4469// All elements had the fneg modifier
4470if (CV->getNumSources() == EltsV2F16.size()) {
4471 Mods |=SISrcMods::NEG;
4472 Mods |=SISrcMods::NEG_HI;
4473 Src =buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4474 }
4475 }
4476
4477return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4478 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4479}
4480
4481InstructionSelector::ComplexRendererFns
4482AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const{
4483Register Src = Root.getReg();
4484unsigned Mods =SISrcMods::OP_SEL_1;
4485SmallVector<Register, 8> EltsV2F16;
4486
4487if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4488assert(CV->getNumSources() > 0);
4489MachineInstr *ElV2F16 =MRI->getVRegDef(CV->getSourceReg(0));
4490// Based on the first element, decide which mod we match: neg or abs.
4491unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4492 ? AMDGPU::G_FNEG
4493 : AMDGPU::G_FABS;
4494
4495for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4496 ElV2F16 =MRI->getVRegDef(CV->getSourceReg(i));
4497if (ElV2F16->getOpcode() != ModOpcode)
4498break;
4499 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4500 }
4501
4502// All elements had ModOpcode modifier
4503if (CV->getNumSources() == EltsV2F16.size()) {
4504MachineIRBuilderB(*Root.getParent());
4505selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4506 *MRI);
4507 }
4508 }
4509
4510return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4511 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4512}
4513
4514InstructionSelector::ComplexRendererFns
4515AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const{
4516 std::optional<FPValueAndVReg> FPValReg;
4517if (mi_match(Root.getReg(), *MRI,m_GFCstOrSplat(FPValReg))) {
4518if (TII.isInlineConstant(FPValReg->Value)) {
4519return {{[=](MachineInstrBuilder &MIB) {
4520 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4521 }}};
4522 }
4523// Non-inlineable splat floats should not fall through to the integer
4524// immediate checks.
4525return {};
4526 }
4527
4528APInt ICst;
4529if (mi_match(Root.getReg(), *MRI,m_ICstOrSplat(ICst))) {
4530if (TII.isInlineConstant(ICst)) {
4531return {
4532 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4533 }
4534 }
4535
4536return {};
4537}
4538
4539InstructionSelector::ComplexRendererFns
4540AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const{
4541Register Src =
4542getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4543unsignedKey = 0;
4544
4545Register ShiftSrc;
4546 std::optional<ValueAndVReg> ShiftAmt;
4547if (mi_match(Src, *MRI,m_GLShr(m_Reg(ShiftSrc),m_GCst(ShiftAmt))) &&
4548MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4549 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4550Key = ShiftAmt->Value.getZExtValue() / 8;
4551 Src = ShiftSrc;
4552 }
4553
4554return {{
4555 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4556 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); }// index_key
4557 }};
4558}
4559
4560InstructionSelector::ComplexRendererFns
4561AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const{
4562
4563Register Src =
4564getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4565unsignedKey = 0;
4566
4567Register ShiftSrc;
4568 std::optional<ValueAndVReg> ShiftAmt;
4569if (mi_match(Src, *MRI,m_GLShr(m_Reg(ShiftSrc),m_GCst(ShiftAmt))) &&
4570MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4571 ShiftAmt->Value.getZExtValue() == 16) {
4572 Src = ShiftSrc;
4573Key = 1;
4574 }
4575
4576return {{
4577 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4578 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); }// index_key
4579 }};
4580}
4581
4582InstructionSelector::ComplexRendererFns
4583AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const{
4584Register Src;
4585unsigned Mods;
4586 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
4587
4588// FIXME: Handle op_sel
4589return {{
4590 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4591 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
4592 }};
4593}
4594
4595InstructionSelector::ComplexRendererFns
4596AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const{
4597Register Src;
4598unsigned Mods;
4599 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4600/*IsCanonicalizing=*/true,
4601/*AllowAbs=*/false,
4602/*OpSel=*/false);
4603
4604return {{
4605 [=](MachineInstrBuilder &MIB) {
4606 MIB.addReg(
4607 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB,/* ForceVGPR */true));
4608 },
4609 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); },// src0_mods
4610 }};
4611}
4612
4613InstructionSelector::ComplexRendererFns
4614AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const{
4615Register Src;
4616unsigned Mods;
4617 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
4618/*IsCanonicalizing=*/true,
4619/*AllowAbs=*/false,
4620/*OpSel=*/true);
4621
4622return {{
4623 [=](MachineInstrBuilder &MIB) {
4624 MIB.addReg(
4625 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB,/* ForceVGPR */true));
4626 },
4627 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); },// src0_mods
4628 }};
4629}
4630
4631bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4632Register &Base,
4633Register *SOffset,
4634 int64_t *Offset) const{
4635MachineInstr *MI = Root.getParent();
4636MachineBasicBlock *MBB =MI->getParent();
4637
4638// FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4639// then we can select all ptr + 32-bit offsets.
4640SmallVector<GEPInfo, 4> AddrInfo;
4641 getAddrModeInfo(*MI, *MRI, AddrInfo);
4642
4643if (AddrInfo.empty())
4644returnfalse;
4645
4646const GEPInfo &GEPI = AddrInfo[0];
4647 std::optional<int64_t> EncodedImm;
4648
4649if (SOffset &&Offset) {
4650 EncodedImm =AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm,/*IsBuffer=*/false,
4651/*HasSOffset=*/true);
4652if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4653 AddrInfo.size() > 1) {
4654const GEPInfo &GEPI2 = AddrInfo[1];
4655if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4656if (Register OffsetReg =
4657matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4658Base = GEPI2.SgprParts[0];
4659 *SOffset = OffsetReg;
4660 *Offset = *EncodedImm;
4661if (*Offset >= 0 || !AMDGPU::hasSMRDSignedImmOffset(STI))
4662returntrue;
4663
4664// For unbuffered smem loads, it is illegal for the immediate offset
4665// to be negative if the resulting (Offset + (M0 or SOffset or zero))
4666// is negative. Handle the case where the immediate offset + SOffset
4667// is negative.
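// E.g. a negative encoded immediate is only accepted if KnownBits proves the
// minimum possible SOffset is large enough that the sum stays non-negative.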
4668auto SKnown =KB->getKnownBits(*SOffset);
4669if (*Offset + SKnown.getMinValue().getSExtValue() < 0)
4670returnfalse;
4671
4672returntrue;
4673 }
4674 }
4675 }
4676returnfalse;
4677 }
4678
4679 EncodedImm =AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm,/*IsBuffer=*/false,
4680/*HasSOffset=*/false);
4681if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4682Base = GEPI.SgprParts[0];
4683 *Offset = *EncodedImm;
4684returntrue;
4685 }
4686
4687// SGPR offset is unsigned.
4688if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4689 GEPI.Imm != 0) {
4690// If we make it this far we have a load with a 32-bit immediate offset.
4691// It is OK to select this using an sgpr offset, because we have already
4692// failed trying to select this load into one of the _IMM variants since
4693// the _IMM patterns are considered before the _SGPR patterns.
4694Base = GEPI.SgprParts[0];
4695 *SOffset =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4696BuildMI(*MBB,MI,MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4697 .addImm(GEPI.Imm);
4698returntrue;
4699 }
4700
4701if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4702if (Register OffsetReg =matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4703Base = GEPI.SgprParts[0];
4704 *SOffset = OffsetReg;
4705returntrue;
4706 }
4707 }
4708
4709returnfalse;
4710}
4711
4712InstructionSelector::ComplexRendererFns
4713AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const{
4714RegisterBase;
4715 int64_tOffset;
4716if (!selectSmrdOffset(Root,Base,/* SOffset= */nullptr, &Offset))
4717return std::nullopt;
4718
4719return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4720 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4721}
4722
4723InstructionSelector::ComplexRendererFns
4724AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const{
4725SmallVector<GEPInfo, 4> AddrInfo;
4726 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4727
4728if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4729return std::nullopt;
4730
4731const GEPInfo &GEPInfo = AddrInfo[0];
4732Register PtrReg = GEPInfo.SgprParts[0];
4733 std::optional<int64_t> EncodedImm =
4734AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4735if (!EncodedImm)
4736return std::nullopt;
4737
4738return {{
4739 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4740 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4741 }};
4742}
4743
4744InstructionSelector::ComplexRendererFns
4745AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const{
4746RegisterBase, SOffset;
4747if (!selectSmrdOffset(Root,Base, &SOffset,/* Offset= */nullptr))
4748return std::nullopt;
4749
4750return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4751 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4752}
4753
4754InstructionSelector::ComplexRendererFns
4755AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const{
4756RegisterBase, SOffset;
4757 int64_tOffset;
4758if (!selectSmrdOffset(Root,Base, &SOffset, &Offset))
4759return std::nullopt;
4760
4761return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4762 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4763 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4764}
4765
4766std::pair<Register, int>
4767AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4768uint64_t FlatVariant) const{
4769MachineInstr *MI = Root.getParent();
4770
4771autoDefault = std::pair(Root.getReg(), 0);
4772
4773if (!STI.hasFlatInstOffsets())
4774returnDefault;
4775
4776Register PtrBase;
4777 int64_t ConstOffset;
4778 std::tie(PtrBase, ConstOffset) =
4779 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4780
4781if (ConstOffset == 0 || (FlatVariant ==SIInstrFlags::FlatScratch &&
4782 !isFlatScratchBaseLegal(Root.getReg())))
4783returnDefault;
4784
4785unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4786if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4787returnDefault;
4788
4789return std::pair(PtrBase, ConstOffset);
4790}
4791
4792InstructionSelector::ComplexRendererFns
4793AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const{
4794auto PtrWithOffset = selectFlatOffsetImpl(Root,SIInstrFlags::FLAT);
4795
4796return {{
4797 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4798 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4799 }};
4800}
4801
4802InstructionSelector::ComplexRendererFns
4803AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const{
4804auto PtrWithOffset = selectFlatOffsetImpl(Root,SIInstrFlags::FlatGlobal);
4805
4806return {{
4807 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4808 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4809 }};
4810}
4811
4812InstructionSelector::ComplexRendererFns
4813AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const{
4814auto PtrWithOffset = selectFlatOffsetImpl(Root,SIInstrFlags::FlatScratch);
4815
4816return {{
4817 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4818 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4819 }};
4820}
4821
4822// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4823InstructionSelector::ComplexRendererFns
4824AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const{
4825RegisterAddr = Root.getReg();
4826Register PtrBase;
4827 int64_t ConstOffset;
4828 int64_t ImmOffset = 0;
4829
4830// Match the immediate offset first, which canonically is moved as low as
4831// possible.
4832 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4833
4834if (ConstOffset != 0) {
4835if (TII.isLegalFLATOffset(ConstOffset,AMDGPUAS::GLOBAL_ADDRESS,
4836SIInstrFlags::FlatGlobal)) {
4837Addr = PtrBase;
4838 ImmOffset = ConstOffset;
4839 }else {
4840auto PtrBaseDef =getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4841if (isSGPR(PtrBaseDef->Reg)) {
4842if (ConstOffset > 0) {
4843// Offset is too large.
4844//
4845// saddr + large_offset -> saddr +
4846// (voffset = large_offset & ~MaxOffset) +
4847// (large_offset & MaxOffset);
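// splitFlatOffset yields SplitImmOffset + RemainderOffset == ConstOffset with
// SplitImmOffset legal for the instruction's offset field; the remainder is
// materialized into a VGPR (HighBits) below and used as the voffset operand.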
4848 int64_t SplitImmOffset, RemainderOffset;
4849 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4850 ConstOffset,AMDGPUAS::GLOBAL_ADDRESS,SIInstrFlags::FlatGlobal);
4851
4852if (isUInt<32>(RemainderOffset)) {
4853MachineInstr *MI = Root.getParent();
4854MachineBasicBlock *MBB =MI->getParent();
4855Register HighBits =
4856MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4857
4858BuildMI(*MBB,MI,MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4859 HighBits)
4860 .addImm(RemainderOffset);
4861
4862return {{
4863 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); },// saddr
4864 [=](MachineInstrBuilder &MIB) {
4865 MIB.addReg(HighBits);
4866 },// voffset
4867 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4868 }};
4869 }
4870 }
4871
4872// We are adding a 64-bit SGPR and a constant. If the constant bus limit
4873// is 1 we would need to perform 1 or 2 extra moves for each half of
4874// the constant, and it is better to do a scalar add and then issue a
4875// single VALU instruction to materialize zero. Otherwise it takes fewer
4876// instructions to perform VALU adds with immediates or inline literals.
4877unsigned NumLiterals =
4878 !TII.isInlineConstant(APInt(32,Lo_32(ConstOffset))) +
4879 !TII.isInlineConstant(APInt(32,Hi_32(ConstOffset)));
4880if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4881return std::nullopt;
4882 }
4883 }
4884 }
4885
4886// Match the variable offset.
4887auto AddrDef =getDefSrcRegIgnoringCopies(Addr, *MRI);
4888if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4889// Look through the SGPR->VGPR copy.
4890Register SAddr =
4891getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4892
4893if (isSGPR(SAddr)) {
4894Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4895
4896// It's possible voffset is an SGPR here, but the copy to VGPR will be
4897// inserted later.
4898if (Register VOffset =matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4899return {{[=](MachineInstrBuilder &MIB) {// saddr
4900 MIB.addReg(SAddr);
4901 },
4902 [=](MachineInstrBuilder &MIB) {// voffset
4903 MIB.addReg(VOffset);
4904 },
4905 [=](MachineInstrBuilder &MIB) {// offset
4906 MIB.addImm(ImmOffset);
4907 }}};
4908 }
4909 }
4910 }
4911
4912// FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4913// drop this.
4914if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4915 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4916return std::nullopt;
4917
4918// It's cheaper to materialize a single 32-bit zero for vaddr than the two
4919// moves required to copy a 64-bit SGPR to VGPR.
4920MachineInstr *MI = Root.getParent();
4921MachineBasicBlock *MBB =MI->getParent();
4922Register VOffset =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4923
4924BuildMI(*MBB,MI,MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4925 .addImm(0);
4926
4927return {{
4928 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); },// saddr
4929 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); },// voffset
4930 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }// offset
4931 }};
4932}
4933
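// Select the SGPR base and immediate offset for an saddr-only flat scratch
// access, folding a legal constant offset and a frame index (or frame index
// plus SGPR, via an S_ADD_I32) into the base where possible.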
4934InstructionSelector::ComplexRendererFns
4935AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const{
4936RegisterAddr = Root.getReg();
4937Register PtrBase;
4938 int64_t ConstOffset;
4939 int64_t ImmOffset = 0;
4940
4941// Match the immediate offset first, which canonically is moved as low as
4942// possible.
4943 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4944
4945if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4946 TII.isLegalFLATOffset(ConstOffset,AMDGPUAS::PRIVATE_ADDRESS,
4947SIInstrFlags::FlatScratch)) {
4948Addr = PtrBase;
4949 ImmOffset = ConstOffset;
4950 }
4951
4952auto AddrDef =getDefSrcRegIgnoringCopies(Addr, *MRI);
4953if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4954int FI = AddrDef->MI->getOperand(1).getIndex();
4955return {{
4956 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); },// saddr
4957 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }// offset
4958 }};
4959 }
4960
4961Register SAddr = AddrDef->Reg;
4962
4963if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4964RegisterLHS = AddrDef->MI->getOperand(1).getReg();
4965RegisterRHS = AddrDef->MI->getOperand(2).getReg();
4966auto LHSDef =getDefSrcRegIgnoringCopies(LHS, *MRI);
4967auto RHSDef =getDefSrcRegIgnoringCopies(RHS, *MRI);
4968
4969if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4970 isSGPR(RHSDef->Reg)) {
4971int FI = LHSDef->MI->getOperand(1).getIndex();
4972MachineInstr &I = *Root.getParent();
4973MachineBasicBlock *BB =I.getParent();
4974constDebugLoc &DL =I.getDebugLoc();
4975 SAddr =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4976
4977BuildMI(*BB, &I,DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4978 .addFrameIndex(FI)
4979 .addReg(RHSDef->Reg)
4980 .setOperandDead(3);// Dead scc
4981 }
4982 }
4983
4984if (!isSGPR(SAddr))
4985return std::nullopt;
4986
4987return {{
4988 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); },// saddr
4989 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }// offset
4990 }};
4991}
4992
4993// Check whether the flat scratch SVS swizzle bug affects this access.
4994bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4995Register VAddr,Register SAddr,uint64_t ImmOffset) const{
4996if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4997returnfalse;
4998
4999// The bug affects the swizzling of SVS accesses if there is any carry out
5000// from the two low order bits (i.e. from bit 1 into bit 2) when adding
5001// voffset to (soffset + inst_offset).
5002auto VKnown =KB->getKnownBits(VAddr);
5003auto SKnown =KnownBits::add(KB->getKnownBits(SAddr),
5004KnownBits::makeConstant(APInt(32, ImmOffset)));
5005uint64_t VMax = VKnown.getMaxValue().getZExtValue();
5006uint64_tSMax = SKnown.getMaxValue().getZExtValue();
5007return (VMax & 3) + (SMax & 3) >= 4;
5008}
5009
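// Select the VGPR offset, SGPR base and immediate offset for an SVS flat
// scratch access, rejecting addresses that would hit the SVS swizzle bug.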
5010InstructionSelector::ComplexRendererFns
5011AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const{
5012RegisterAddr = Root.getReg();
5013Register PtrBase;
5014 int64_t ConstOffset;
5015 int64_t ImmOffset = 0;
5016
5017// Match the immediate offset first, which canonically is moved as low as
5018// possible.
5019 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
5020
5021Register OrigAddr =Addr;
5022if (ConstOffset != 0 &&
5023 TII.isLegalFLATOffset(ConstOffset,AMDGPUAS::PRIVATE_ADDRESS,true)) {
5024Addr = PtrBase;
5025 ImmOffset = ConstOffset;
5026 }
5027
5028auto AddrDef =getDefSrcRegIgnoringCopies(Addr, *MRI);
5029if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
5030return std::nullopt;
5031
5032RegisterRHS = AddrDef->MI->getOperand(2).getReg();
5033if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
5034return std::nullopt;
5035
5036RegisterLHS = AddrDef->MI->getOperand(1).getReg();
5037auto LHSDef =getDefSrcRegIgnoringCopies(LHS, *MRI);
5038
5039if (OrigAddr !=Addr) {
5040if (!isFlatScratchBaseLegalSVImm(OrigAddr))
5041return std::nullopt;
5042 }else {
5043if (!isFlatScratchBaseLegalSV(OrigAddr))
5044return std::nullopt;
5045 }
5046
5047if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
5048return std::nullopt;
5049
5050if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5051int FI = LHSDef->MI->getOperand(1).getIndex();
5052return {{
5053 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },// vaddr
5054 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); },// saddr
5055 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }// offset
5056 }};
5057 }
5058
5059if (!isSGPR(LHS))
5060return std::nullopt;
5061
5062return {{
5063 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); },// vaddr
5064 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); },// saddr
5065 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); }// offset
5066 }};
5067}
5068
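// Select rsrc, vaddr, soffset and offset for a MUBUF scratch access using a
// VGPR address (offen). A constant address is split into a materialized high
// part and an immediate low part; otherwise a frame index or a base plus
// legal constant offset is folded into vaddr/offset.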
5069InstructionSelector::ComplexRendererFns
5070AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const{
5071MachineInstr *MI = Root.getParent();
5072MachineBasicBlock *MBB =MI->getParent();
5073MachineFunction *MF =MBB->getParent();
5074constSIMachineFunctionInfo *Info =MF->getInfo<SIMachineFunctionInfo>();
5075
5076 int64_tOffset = 0;
5077if (mi_match(Root.getReg(), *MRI,m_ICst(Offset)) &&
5078Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
5079Register HighBits =MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5080
5081// TODO: Should this be inside the render function? The iterator seems to
5082// move.
5083constuint32_t MaxOffset =SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
5084BuildMI(*MBB,MI,MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
5085 HighBits)
5086 .addImm(Offset & ~MaxOffset);
5087
5088return {{[=](MachineInstrBuilder &MIB) {// rsrc
5089 MIB.addReg(Info->getScratchRSrcReg());
5090 },
5091 [=](MachineInstrBuilder &MIB) {// vaddr
5092 MIB.addReg(HighBits);
5093 },
5094 [=](MachineInstrBuilder &MIB) {// soffset
5095// Use constant zero for soffset and rely on eliminateFrameIndex
5096// to choose the appropriate frame register if need be.
5097 MIB.addImm(0);
5098 },
5099 [=](MachineInstrBuilder &MIB) {// offset
5100 MIB.addImm(Offset & MaxOffset);
5101 }}};
5102 }
5103
5104assert(Offset == 0 ||Offset == -1);
5105
5106// Try to fold a frame index directly into the MUBUF vaddr field, and any
5107// offsets.
5108 std::optional<int> FI;
5109Register VAddr = Root.getReg();
5110
5111constMachineInstr *RootDef =MRI->getVRegDef(Root.getReg());
5112Register PtrBase;
5113 int64_t ConstOffset;
5114 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
5115if (ConstOffset != 0) {
5116if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
5117 (!STI.privateMemoryResourceIsRangeChecked() ||
5118KB->signBitIsZero(PtrBase))) {
5119constMachineInstr *PtrBaseDef =MRI->getVRegDef(PtrBase);
5120if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
5121 FI = PtrBaseDef->getOperand(1).getIndex();
5122else
5123 VAddr = PtrBase;
5124Offset = ConstOffset;
5125 }
5126 }elseif (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
5127 FI = RootDef->getOperand(1).getIndex();
5128 }
5129
5130return {{[=](MachineInstrBuilder &MIB) {// rsrc
5131 MIB.addReg(Info->getScratchRSrcReg());
5132 },
5133 [=](MachineInstrBuilder &MIB) {// vaddr
5134if (FI)
5135 MIB.addFrameIndex(*FI);
5136else
5137 MIB.addReg(VAddr);
5138 },
5139 [=](MachineInstrBuilder &MIB) {// soffset
5140// Use constant zero for soffset and rely on eliminateFrameIndex
5141// to choose the appropriate frame register if need be.
5142 MIB.addImm(0);
5143 },
5144 [=](MachineInstrBuilder &MIB) {// offset
5145 MIB.addImm(Offset);
5146 }}};
5147}
5148
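// Return true if \p Offset can be folded into a DS instruction's unsigned
// 16-bit offset field when used with \p Base.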
5149bool AMDGPUInstructionSelector::isDSOffsetLegal(RegisterBase,
5150 int64_tOffset) const{
5151if (!isUInt<16>(Offset))
5152returnfalse;
5153
5154if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5155returntrue;
5156
5157// On Southern Islands, instructions with a negative base value and an offset
5158// don't seem to work.
5159returnKB->signBitIsZero(Base);
5160}
5161
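// Return true if \p Offset0 and \p Offset1 can be used as the two Size-scaled
// 8-bit offset fields of a DS read2/write2 style instruction with \p Base.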
5162bool AMDGPUInstructionSelector::isDSOffset2Legal(RegisterBase, int64_t Offset0,
5163 int64_t Offset1,
5164unsignedSize) const{
5165if (Offset0 %Size != 0 || Offset1 %Size != 0)
5166returnfalse;
5167if (!isUInt<8>(Offset0 /Size) || !isUInt<8>(Offset1 /Size))
5168returnfalse;
5169
5170if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
5171returntrue;
5172
5173// On Southern Islands, instructions with a negative base value and an offset
5174// don't seem to work.
5175returnKB->signBitIsZero(Base);
5176}
5177
5178// Return whether the operation has NoUnsignedWrap property.
5179staticboolisNoUnsignedWrap(MachineInstr *Addr) {
5180returnAddr->getOpcode() == TargetOpcode::G_OR ||
5181 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
5182Addr->getFlag(MachineInstr::NoUWrap));
5183}
5184
5185// Check that the base address of flat scratch load/store in the form of `base +
5186// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
5187// requirement). We always treat the first operand as the base address here.
5188bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(RegisterAddr) const{
5189MachineInstr *AddrMI =getDefIgnoringCopies(Addr, *MRI);
5190
5191if (isNoUnsignedWrap(AddrMI))
5192returntrue;
5193
5194// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5195// values.
5196if (STI.hasSignedScratchOffsets())
5197returntrue;
5198
5199RegisterLHS = AddrMI->getOperand(1).getReg();
5200RegisterRHS = AddrMI->getOperand(2).getReg();
5201
5202if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
5203 std::optional<ValueAndVReg> RhsValReg =
5204getIConstantVRegValWithLookThrough(RHS, *MRI);
5205// If the immediate offset is negative and within a certain range, the base
5206// address cannot also be negative: if it were, the sum would be either
5207// negative or much larger than the valid range of scratch memory a thread
5208// can access.
5209if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
5210 RhsValReg->Value.getSExtValue() > -0x40000000)
5211returntrue;
5212 }
5213
5214returnKB->signBitIsZero(LHS);
5215}
5216
5217// Check that the address values in SGPR/VGPR are legal for flat scratch in
5218// the form of: SGPR + VGPR.
5219bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(RegisterAddr) const{
5220MachineInstr *AddrMI =getDefIgnoringCopies(Addr, *MRI);
5221
5222if (isNoUnsignedWrap(AddrMI))
5223returntrue;
5224
5225// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5226// values.
5227if (STI.hasSignedScratchOffsets())
5228returntrue;
5229
5230RegisterLHS = AddrMI->getOperand(1).getReg();
5231RegisterRHS = AddrMI->getOperand(2).getReg();
5232returnKB->signBitIsZero(RHS) &&KB->signBitIsZero(LHS);
5233}
5234
5235// Check that the address values in SGPR/VGPR are legal for flat scratch in
5236// the form of: SGPR + VGPR + Imm.
5237bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
5238RegisterAddr) const{
5239// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
5240// values.
5241if (STI.hasSignedScratchOffsets())
5242returntrue;
5243
5244MachineInstr *AddrMI =getDefIgnoringCopies(Addr, *MRI);
5245RegisterBase = AddrMI->getOperand(1).getReg();
5246 std::optional<DefinitionAndSourceRegister> BaseDef =
5247getDefSrcRegIgnoringCopies(Base, *MRI);
5248 std::optional<ValueAndVReg> RHSOffset =
5249getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
5250assert(RHSOffset);
5251
5252// If the immediate offset is negative and within a certain range, the base
5253// address cannot also be negative: if it were, the sum would be either
5254// negative or much larger than the valid range of scratch memory a thread
5255// can access.
5256if (isNoUnsignedWrap(BaseDef->MI) &&
5257 (isNoUnsignedWrap(AddrMI) ||
5258 (RHSOffset->Value.getSExtValue() < 0 &&
5259 RHSOffset->Value.getSExtValue() > -0x40000000)))
5260returntrue;
5261
5262RegisterLHS = BaseDef->MI->getOperand(1).getReg();
5263RegisterRHS = BaseDef->MI->getOperand(2).getReg();
5264returnKB->signBitIsZero(RHS) &&KB->signBitIsZero(LHS);
5265}
5266
5267bool AMDGPUInstructionSelector::isUnneededShiftMask(constMachineInstr &MI,
5268unsigned ShAmtBits) const{
5269assert(MI.getOpcode() == TargetOpcode::G_AND);
5270
5271 std::optional<APInt>RHS =
5272getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
5273if (!RHS)
5274returnfalse;
5275
5276if (RHS->countr_one() >= ShAmtBits)
5277returntrue;
5278
5279constAPInt &LHSKnownZeros =KB->getKnownZeroes(MI.getOperand(1).getReg());
5280return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
5281}
5282
5283InstructionSelector::ComplexRendererFns
5284AMDGPUInstructionSelector::selectMUBUFScratchOffset(
5285MachineOperand &Root) const{
5286RegisterReg = Root.getReg();
5287constSIMachineFunctionInfo *Info =MF->getInfo<SIMachineFunctionInfo>();
5288
5289 std::optional<DefinitionAndSourceRegister>Def =
5290getDefSrcRegIgnoringCopies(Reg, *MRI);
5291assert(Def &&"this shouldn't be an optional result");
5292Reg =Def->Reg;
5293
5294if (Register WaveBase =getWaveAddress(Def->MI)) {
5295return {{
5296 [=](MachineInstrBuilder &MIB) {// rsrc
5297 MIB.addReg(Info->getScratchRSrcReg());
5298 },
5299 [=](MachineInstrBuilder &MIB) {// soffset
5300 MIB.addReg(WaveBase);
5301 },
5302 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }// offset
5303 }};
5304 }
5305
5306 int64_tOffset = 0;
5307
5308// FIXME: Copy check is a hack
5309RegisterBasePtr;
5310if (mi_match(Reg, *MRI,
5311m_GPtrAdd(m_Reg(BasePtr),
5312m_any_of(m_ICst(Offset),m_Copy(m_ICst(Offset)))))) {
5313if (!TII.isLegalMUBUFImmOffset(Offset))
5314return {};
5315MachineInstr *BasePtrDef =getDefIgnoringCopies(BasePtr, *MRI);
5316Register WaveBase =getWaveAddress(BasePtrDef);
5317if (!WaveBase)
5318return {};
5319
5320return {{
5321 [=](MachineInstrBuilder &MIB) {// rsrc
5322 MIB.addReg(Info->getScratchRSrcReg());
5323 },
5324 [=](MachineInstrBuilder &MIB) {// soffset
5325 MIB.addReg(WaveBase);
5326 },
5327 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }// offset
5328 }};
5329 }
5330
5331if (!mi_match(Root.getReg(), *MRI,m_ICst(Offset)) ||
5332 !TII.isLegalMUBUFImmOffset(Offset))
5333return {};
5334
5335return {{
5336 [=](MachineInstrBuilder &MIB) {// rsrc
5337 MIB.addReg(Info->getScratchRSrcReg());
5338 },
5339 [=](MachineInstrBuilder &MIB) {// soffset
5340 MIB.addImm(0);
5341 },
5342 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }// offset
5343 }};
5344}
5345
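// Match a DS address as a base register plus a legal unsigned 16-bit constant
// offset, falling back to (Root, 0) when no such split is possible.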
5346std::pair<Register, unsigned>
5347AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const{
5348constMachineInstr *RootDef =MRI->getVRegDef(Root.getReg());
5349 int64_t ConstAddr = 0;
5350
5351Register PtrBase;
5352 int64_tOffset;
5353 std::tie(PtrBase,Offset) =
5354 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5355
5356if (Offset) {
5357if (isDSOffsetLegal(PtrBase,Offset)) {
5358// (add n0, c0)
5359return std::pair(PtrBase,Offset);
5360 }
5361 }elseif (RootDef->getOpcode() == AMDGPU::G_SUB) {
5362// TODO
5363
5364
5365 }elseif (mi_match(Root.getReg(), *MRI,m_ICst(ConstAddr))) {
5366// TODO
5367
5368 }
5369
5370return std::pair(Root.getReg(), 0);
5371}
5372
5373InstructionSelector::ComplexRendererFns
5374AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const{
5375RegisterReg;
5376unsignedOffset;
5377 std::tie(Reg,Offset) = selectDS1Addr1OffsetImpl(Root);
5378return {{
5379 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5380 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
5381 }};
5382}
5383
5384InstructionSelector::ComplexRendererFns
5385AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const{
5386return selectDSReadWrite2(Root, 4);
5387}
5388
5389InstructionSelector::ComplexRendererFns
5390AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const{
5391return selectDSReadWrite2(Root, 8);
5392}
5393
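// Select the base register and the two consecutive Size-scaled offsets used
// by ds_read2/ds_write2 style instructions.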
5394InstructionSelector::ComplexRendererFns
5395AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
5396unsignedSize) const{
5397RegisterReg;
5398unsignedOffset;
5399 std::tie(Reg,Offset) = selectDSReadWrite2Impl(Root,Size);
5400return {{
5401 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
5402 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
5403 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
5404 }};
5405}
5406
5407std::pair<Register, unsigned>
5408AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
5409unsignedSize) const{
5410constMachineInstr *RootDef =MRI->getVRegDef(Root.getReg());
5411 int64_t ConstAddr = 0;
5412
5413Register PtrBase;
5414 int64_tOffset;
5415 std::tie(PtrBase,Offset) =
5416 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
5417
5418if (Offset) {
5419 int64_t OffsetValue0 =Offset;
5420 int64_t OffsetValue1 =Offset +Size;
5421if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1,Size)) {
5422// (add n0, c0)
5423return std::pair(PtrBase, OffsetValue0 /Size);
5424 }
5425 }elseif (RootDef->getOpcode() == AMDGPU::G_SUB) {
5426// TODO
5427
5428 }elseif (mi_match(Root.getReg(), *MRI,m_ICst(ConstAddr))) {
5429// TODO
5430
5431 }
5432
5433return std::pair(Root.getReg(), 0);
5434}
5435
5436/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
5437/// the base value with the constant offset. There may be intervening copies
5438/// between \p Root and the identified constant. Returns \p Root, 0 if this does
5439/// not match the pattern.
5440std::pair<Register, int64_t>
5441AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
5442Register Root,constMachineRegisterInfo &MRI) const{
5443MachineInstr *RootI =getDefIgnoringCopies(Root, MRI);
5444if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
5445return {Root, 0};
5446
5447MachineOperand &RHS = RootI->getOperand(2);
5448 std::optional<ValueAndVReg> MaybeOffset =
5449getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5450if (!MaybeOffset)
5451return {Root, 0};
5452return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5453}
5454
5455staticvoidaddZeroImm(MachineInstrBuilder &MIB) {
5456 MIB.addImm(0);
5457}
5458
5459/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5460/// BasePtr is not valid, a null base pointer will be used.
5461staticRegisterbuildRSRC(MachineIRBuilder &B,MachineRegisterInfo &MRI,
5462uint32_t FormatLo,uint32_t FormatHi,
5463Register BasePtr) {
5464Register RSrc2 =MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5465Register RSrc3 =MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5466Register RSrcHi =MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5467Register RSrc =MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5468
5469B.buildInstr(AMDGPU::S_MOV_B32)
5470 .addDef(RSrc2)
5471 .addImm(FormatLo);
5472B.buildInstr(AMDGPU::S_MOV_B32)
5473 .addDef(RSrc3)
5474 .addImm(FormatHi);
5475
5476// Build the register half holding the constants before building the full
5477// 128-bit register. If we are building multiple resource descriptors,
5478// this will allow CSEing of the 2-component register.
5479B.buildInstr(AMDGPU::REG_SEQUENCE)
5480 .addDef(RSrcHi)
5481 .addReg(RSrc2)
5482 .addImm(AMDGPU::sub0)
5483 .addReg(RSrc3)
5484 .addImm(AMDGPU::sub1);
5485
5486Register RSrcLo = BasePtr;
5487if (!BasePtr) {
5488 RSrcLo =MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5489B.buildInstr(AMDGPU::S_MOV_B64)
5490 .addDef(RSrcLo)
5491 .addImm(0);
5492 }
5493
5494B.buildInstr(AMDGPU::REG_SEQUENCE)
5495 .addDef(RSrc)
5496 .addReg(RSrcLo)
5497 .addImm(AMDGPU::sub0_sub1)
5498 .addReg(RSrcHi)
5499 .addImm(AMDGPU::sub2_sub3);
5500
5501return RSrc;
5502}
5503
5504staticRegisterbuildAddr64RSrc(MachineIRBuilder &B,MachineRegisterInfo &MRI,
5505constSIInstrInfo &TII,Register BasePtr) {
5506uint64_t DefaultFormat =TII.getDefaultRsrcDataFormat();
5507
5508// FIXME: Why are half the "default" bits ignored based on the addressing
5509// mode?
5510returnbuildRSRC(B,MRI, 0,Hi_32(DefaultFormat), BasePtr);
5511}
5512
5513staticRegisterbuildOffsetSrc(MachineIRBuilder &B,MachineRegisterInfo &MRI,
5514constSIInstrInfo &TII,Register BasePtr) {
5515uint64_t DefaultFormat =TII.getDefaultRsrcDataFormat();
5516
5517// FIXME: Why are half the "default" bits ignored based on the addressing
5518// mode?
5519returnbuildRSRC(B,MRI, -1,Hi_32(DefaultFormat), BasePtr);
5520}
5521
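// Decompose a MUBUF address into a base (N0), an optional 32-bit constant
// offset, and, when the base is itself a G_PTR_ADD, its two addends (N2, N3).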
5522AMDGPUInstructionSelector::MUBUFAddressData
5523AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const{
5524 MUBUFAddressDataData;
5525Data.N0 = Src;
5526
5527Register PtrBase;
5528 int64_tOffset;
5529
5530 std::tie(PtrBase,Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5531if (isUInt<32>(Offset)) {
5532Data.N0 = PtrBase;
5533Data.Offset =Offset;
5534 }
5535
5536if (MachineInstr *InputAdd
5537 =getOpcodeDef(TargetOpcode::G_PTR_ADD,Data.N0, *MRI)) {
5538Data.N2 = InputAdd->getOperand(1).getReg();
5539Data.N3 = InputAdd->getOperand(2).getReg();
5540
5541// FIXME: Need to fix the extra SGPR->VGPR copies that get inserted.
5542// FIXME: We don't actually know that the value is defined by operand 0 of
5543// its def.
5543//
5544// TODO: Remove this when we have copy folding optimizations after
5545// RegBankSelect.
5546Data.N2 =getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5547Data.N3 =getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5548 }
5549
5550returnData;
5551}
5552
5553/// Return whether the addr64 MUBUF mode should be used for the given address.
5554bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressDataAddr) const{
5555// (ptr_add N2, N3) -> addr64, or
5556// (ptr_add (ptr_add N2, N3), C1) -> addr64
5557if (Addr.N2)
5558returntrue;
5559
5560constRegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5561return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5562}
5563
5564/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5565/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5566/// component.
5567void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5568MachineIRBuilder &B,Register &SOffset, int64_t &ImmOffset) const{
5569if (TII.isLegalMUBUFImmOffset(ImmOffset))
5570return;
5571
5572// Illegal offset, store it in soffset.
5573 SOffset =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5574B.buildInstr(AMDGPU::S_MOV_B32)
5575 .addDef(SOffset)
5576 .addImm(ImmOffset);
5577 ImmOffset = 0;
5578}
5579
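// Compute the vaddr, resource descriptor and soffset operands for a MUBUF
// addr64 access. Returns false if the subtarget has no addr64 mode or the
// address should use the offset-only form instead.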
5580bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5581MachineOperand &Root,Register &VAddr,Register &RSrcReg,
5582Register &SOffset, int64_t &Offset) const{
5583// FIXME: Predicates should stop this from reaching here.
5584// The addr64 bit was removed for Volcanic Islands.
5585if (!STI.hasAddr64() || STI.useFlatForGlobal())
5586returnfalse;
5587
5588 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5589if (!shouldUseAddr64(AddrData))
5590returnfalse;
5591
5592Register N0 = AddrData.N0;
5593Register N2 = AddrData.N2;
5594Register N3 = AddrData.N3;
5595Offset = AddrData.Offset;
5596
5597// Base pointer for the SRD.
5598Register SRDPtr;
5599
5600if (N2) {
5601if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5602assert(N3);
5603if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5604// Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5605// addr64, and construct the default resource from a 0 address.
5606 VAddr = N0;
5607 }else {
5608 SRDPtr = N3;
5609 VAddr = N2;
5610 }
5611 }else {
5612// N2 is not divergent.
5613 SRDPtr = N2;
5614 VAddr = N3;
5615 }
5616 }elseif (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5617// Use the default null pointer in the resource
5618 VAddr = N0;
5619 }else {
5620// N0 -> offset, or
5621// (N0 + C1) -> offset
5622 SRDPtr = N0;
5623 }
5624
5625MachineIRBuilderB(*Root.getParent());
5626 RSrcReg =buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5627 splitIllegalMUBUFOffset(B, SOffset,Offset);
5628returntrue;
5629}
5630
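// Compute the resource descriptor and soffset operands for the offset-only
// MUBUF form, used when no VGPR address component is required.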
5631bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5632MachineOperand &Root,Register &RSrcReg,Register &SOffset,
5633 int64_t &Offset) const{
5634
5635// FIXME: Pattern should not reach here.
5636if (STI.useFlatForGlobal())
5637returnfalse;
5638
5639 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5640if (shouldUseAddr64(AddrData))
5641returnfalse;
5642
5643// N0 -> offset, or
5644// (N0 + C1) -> offset
5645Register SRDPtr = AddrData.N0;
5646Offset = AddrData.Offset;
5647
5648// TODO: Look through extensions for 32-bit soffset.
5649MachineIRBuilderB(*Root.getParent());
5650
5651 RSrcReg =buildOffsetSrc(B, *MRI, TII, SRDPtr);
5652 splitIllegalMUBUFOffset(B, SOffset,Offset);
5653returntrue;
5654}
5655
5656InstructionSelector::ComplexRendererFns
5657AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const{
5658Register VAddr;
5659Register RSrcReg;
5660Register SOffset;
5661 int64_tOffset = 0;
5662
5663if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset,Offset))
5664return {};
5665
5666// FIXME: Use defaulted operands for trailing 0s and remove from the complex
5667// pattern.
5668return {{
5669 [=](MachineInstrBuilder &MIB) {// rsrc
5670 MIB.addReg(RSrcReg);
5671 },
5672 [=](MachineInstrBuilder &MIB) {// vaddr
5673 MIB.addReg(VAddr);
5674 },
5675 [=](MachineInstrBuilder &MIB) {// soffset
5676if (SOffset)
5677 MIB.addReg(SOffset);
5678elseif (STI.hasRestrictedSOffset())
5679 MIB.addReg(AMDGPU::SGPR_NULL);
5680else
5681 MIB.addImm(0);
5682 },
5683 [=](MachineInstrBuilder &MIB) {// offset
5684 MIB.addImm(Offset);
5685 },
5686addZeroImm,// cpol
5687addZeroImm,// tfe
5688addZeroImm// swz
5689 }};
5690}
5691
5692InstructionSelector::ComplexRendererFns
5693AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const{
5694Register RSrcReg;
5695Register SOffset;
5696 int64_tOffset = 0;
5697
5698if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset,Offset))
5699return {};
5700
5701return {{
5702 [=](MachineInstrBuilder &MIB) {// rsrc
5703 MIB.addReg(RSrcReg);
5704 },
5705 [=](MachineInstrBuilder &MIB) {// soffset
5706if (SOffset)
5707 MIB.addReg(SOffset);
5708elseif (STI.hasRestrictedSOffset())
5709 MIB.addReg(AMDGPU::SGPR_NULL);
5710else
5711 MIB.addImm(0);
5712 },
5713 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },// offset
5714addZeroImm,// cpol
5715addZeroImm,// tfe
5716addZeroImm,// swz
5717 }};
5718}
5719
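// Select the soffset operand for a buffer instruction, substituting SGPR_NULL
// for a known-zero soffset on subtargets with restricted soffset support.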
5720InstructionSelector::ComplexRendererFns
5721AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const{
5722
5723Register SOffset = Root.getReg();
5724
5725if (STI.hasRestrictedSOffset() &&mi_match(SOffset, *MRI,m_ZeroInt()))
5726 SOffset = AMDGPU::SGPR_NULL;
5727
5728return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5729}
5730
5731/// Get an immediate that must be 32 bits and is treated as zero extended.
5732static std::optional<uint64_t>
5733getConstantZext32Val(Register Reg,constMachineRegisterInfo &MRI) {
5734// getIConstantVRegVal sexts any values, so see if that matters.
5735 std::optional<int64_t> OffsetVal =getIConstantVRegSExtVal(Reg,MRI);
5736if (!OffsetVal || !isInt<32>(*OffsetVal))
5737return std::nullopt;
5738returnLo_32(*OffsetVal);
5739}
5740
5741InstructionSelector::ComplexRendererFns
5742AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const{
5743 std::optional<uint64_t> OffsetVal =
5744 Root.isImm() ? Root.getImm() :getConstantZext32Val(Root.getReg(), *MRI);
5745if (!OffsetVal)
5746return {};
5747
5748 std::optional<int64_t> EncodedImm =
5749AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal,true);
5750if (!EncodedImm)
5751return {};
5752
5753return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5754}
5755
5756InstructionSelector::ComplexRendererFns
5757AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const{
5758assert(STI.getGeneration() ==AMDGPUSubtarget::SEA_ISLANDS);
5759
5760 std::optional<uint64_t> OffsetVal =getConstantZext32Val(Root.getReg(), *MRI);
5761if (!OffsetVal)
5762return {};
5763
5764 std::optional<int64_t> EncodedImm =
5765AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5766if (!EncodedImm)
5767return {};
5768
5769return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5770}
5771
5772InstructionSelector::ComplexRendererFns
5773AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const{
5774// Match the (soffset + offset) pair as a 32-bit register base and
5775// an immediate offset.
5776Register SOffset;
5777unsignedOffset;
5778 std::tie(SOffset,Offset) =AMDGPU::getBaseWithConstantOffset(
5779 *MRI, Root.getReg(),KB,/*CheckNUW*/true);
5780if (!SOffset)
5781return std::nullopt;
5782
5783 std::optional<int64_t> EncodedOffset =
5784AMDGPU::getSMRDEncodedOffset(STI,Offset,/* IsBuffer */true);
5785if (!EncodedOffset)
5786return std::nullopt;
5787
5788assert(MRI->getType(SOffset) ==LLT::scalar(32));
5789return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5790 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5791}
5792
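// Match a mad-mix source operand. If the value is an fpext from f16, fold the
// extension (and any abs/neg or high-half extract) into the op_sel/op_sel_hi
// source modifiers and set \p Matched; otherwise return the plain VOP3 mods.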
5793std::pair<Register, unsigned>
5794AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5795bool &Matched) const{
5796 Matched =false;
5797
5798Register Src;
5799unsigned Mods;
5800 std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
5801
5802if (mi_match(Src, *MRI,m_GFPExt(m_Reg(Src)))) {
5803assert(MRI->getType(Src) ==LLT::scalar(16));
5804
5805// Only change Src if a source modifier could be gained. In such cases the
5806// new Src could be an SGPR, but this does not violate the constant bus
5807// restriction for the instruction that is being selected.
5808 Src =stripBitCast(Src, *MRI);
5809
5810constauto CheckAbsNeg = [&]() {
5811// Be careful about folding modifiers if we already have an abs. fneg is
5812// applied last, so we don't want to apply an earlier fneg.
5813if ((Mods &SISrcMods::ABS) == 0) {
5814unsigned ModsTmp;
5815 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);
5816
5817if ((ModsTmp &SISrcMods::NEG) != 0)
5818 Mods ^=SISrcMods::NEG;
5819
5820if ((ModsTmp &SISrcMods::ABS) != 0)
5821 Mods |=SISrcMods::ABS;
5822 }
5823 };
5824
5825 CheckAbsNeg();
5826
5827// op_sel/op_sel_hi decide the source type and source.
5828// If the source's op_sel_hi is set, it indicates a conversion from fp16 is
5829// needed. If the source's op_sel is set, it picks the high half of the
5830// source register.
5831
5832 Mods |=SISrcMods::OP_SEL_1;
5833
5834if (isExtractHiElt(*MRI, Src, Src)) {
5835 Mods |=SISrcMods::OP_SEL_0;
5836 CheckAbsNeg();
5837 }
5838
5839 Matched =true;
5840 }
5841
5842return {Src, Mods};
5843}
5844
5845InstructionSelector::ComplexRendererFns
5846AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5847MachineOperand &Root) const{
5848Register Src;
5849unsigned Mods;
5850bool Matched;
5851 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5852if (!Matched)
5853return {};
5854
5855return {{
5856 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5857 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
5858 }};
5859}
5860
5861InstructionSelector::ComplexRendererFns
5862AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const{
5863Register Src;
5864unsigned Mods;
5865bool Matched;
5866 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5867
5868return {{
5869 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5870 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }// src_mods
5871 }};
5872}
5873
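// Select s_barrier_signal_isfirst: emit the immediate form of the signal and
// copy SCC into the result register.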
5874bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5875MachineInstr &I,Intrinsic::ID IntrID) const{
5876MachineBasicBlock *MBB =I.getParent();
5877constDebugLoc &DL =I.getDebugLoc();
5878Register CCReg =I.getOperand(0).getReg();
5879
5880BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5881 .addImm(I.getOperand(2).getImm());
5882
5883BuildMI(*MBB, &I,DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5884
5885I.eraseFromParent();
5886return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5887 *MRI);
5888}
5889
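// Select s_get_barrier_state, using the immediate form when the barrier
// operand is a constant and the M0 form otherwise.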
5890bool AMDGPUInstructionSelector::selectSGetBarrierState(
5891MachineInstr &I,Intrinsic::ID IntrID) const{
5892MachineBasicBlock *MBB =I.getParent();
5893constDebugLoc &DL =I.getDebugLoc();
5894MachineOperand BarOp =I.getOperand(2);
5895 std::optional<int64_t> BarValImm =
5896getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5897
5898if (!BarValImm) {
5899auto CopyMIB =BuildMI(*MBB, &I,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5900 .addReg(BarOp.getReg());
5901constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5902 }
5903MachineInstrBuilder MIB;
5904unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
5905 : AMDGPU::S_GET_BARRIER_STATE_M0;
5906 MIB =BuildMI(*MBB, &I,DL, TII.get(Opc));
5907
5908auto DstReg =I.getOperand(0).getReg();
5909constTargetRegisterClass *DstRC =
5910TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
5911if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
5912returnfalse;
5913 MIB.addDef(DstReg);
5914if (BarValImm) {
5915 MIB.addImm(*BarValImm);
5916 }
5917I.eraseFromParent();
5918returntrue;
5919}
5920
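// Map a named barrier intrinsic to its immediate or M0-based opcode.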
5921unsignedgetNamedBarrierOp(bool HasInlineConst,Intrinsic::ID IntrID) {
5922if (HasInlineConst) {
5923switch (IntrID) {
5924default:
5925llvm_unreachable("not a named barrier op");
5926case Intrinsic::amdgcn_s_barrier_join:
5927return AMDGPU::S_BARRIER_JOIN_IMM;
5928case Intrinsic::amdgcn_s_get_named_barrier_state:
5929return AMDGPU::S_GET_BARRIER_STATE_IMM;
5930 };
5931 }else {
5932switch (IntrID) {
5933default:
5934llvm_unreachable("not a named barrier op");
5935case Intrinsic::amdgcn_s_barrier_join:
5936return AMDGPU::S_BARRIER_JOIN_M0;
5937case Intrinsic::amdgcn_s_get_named_barrier_state:
5938return AMDGPU::S_GET_BARRIER_STATE_M0;
5939 };
5940 }
5941}
5942
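// Select s_barrier_init / s_barrier_signal for a named barrier: pack
// ((CntOp & 0x3F) << 16) | ((BarOp >> 4) & 0x3F) into M0 and emit the M0 form.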
5943bool AMDGPUInstructionSelector::selectNamedBarrierInit(
5944MachineInstr &I,Intrinsic::ID IntrID) const{
5945MachineBasicBlock *MBB =I.getParent();
5946constDebugLoc &DL =I.getDebugLoc();
5947MachineOperand BarOp =I.getOperand(1);
5948MachineOperand CntOp =I.getOperand(2);
5949
5950// BarID = (BarOp >> 4) & 0x3F
5951Register TmpReg0 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5952BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
5953 .add(BarOp)
5954 .addImm(4u)
5955 .setOperandDead(3);// Dead scc
5956
5957Register TmpReg1 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5958BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
5959 .addReg(TmpReg0)
5960 .addImm(0x3F)
5961 .setOperandDead(3);// Dead scc
5962
5963// MO = ((CntOp & 0x3F) << shAmt) | BarID
5964Register TmpReg2 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5965BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_AND_B32), TmpReg2)
5966 .add(CntOp)
5967 .addImm(0x3F)
5968 .setOperandDead(3);// Dead scc
5969
5970Register TmpReg3 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5971constexprunsigned ShAmt = 16;
5972BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg3)
5973 .addReg(TmpReg2)
5974 .addImm(ShAmt)
5975 .setOperandDead(3);// Dead scc
5976
5977Register TmpReg4 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5978BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_OR_B32), TmpReg4)
5979 .addReg(TmpReg1)
5980 .addReg(TmpReg3)
5981 .setOperandDead(3);// Dead scc;
5982
5983auto CopyMIB =
5984BuildMI(*MBB, &I,DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(TmpReg4);
5985constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5986
5987unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
5988 ? AMDGPU::S_BARRIER_INIT_M0
5989 : AMDGPU::S_BARRIER_SIGNAL_M0;
5990MachineInstrBuilder MIB;
5991 MIB =BuildMI(*MBB, &I,DL, TII.get(Opc));
5992
5993I.eraseFromParent();
5994returntrue;
5995}
5996
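// Select s_barrier_join / s_get_barrier_state for a named barrier. A constant
// barrier operand uses the immediate form with BarID = (imm >> 4) & 0x3F;
// otherwise the ID is computed into M0 and the M0 form is used.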
5997bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5998MachineInstr &I,Intrinsic::ID IntrID) const{
5999MachineBasicBlock *MBB =I.getParent();
6000constDebugLoc &DL =I.getDebugLoc();
6001MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
6002 ?I.getOperand(2)
6003 :I.getOperand(1);
6004 std::optional<int64_t> BarValImm =
6005getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
6006
6007if (!BarValImm) {
6008// BarID = (BarOp >> 4) & 0x3F
6009Register TmpReg0 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6010BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg0)
6011 .addReg(BarOp.getReg())
6012 .addImm(4u)
6013 .setOperandDead(3);// Dead scc;
6014
6015Register TmpReg1 =MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
6016BuildMI(*MBB, &I,DL, TII.get(AMDGPU::S_AND_B32), TmpReg1)
6017 .addReg(TmpReg0)
6018 .addImm(0x3F)
6019 .setOperandDead(3);// Dead scc;
6020
6021auto CopyMIB =BuildMI(*MBB, &I,DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
6022 .addReg(TmpReg1);
6023constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
6024 }
6025
6026MachineInstrBuilder MIB;
6027unsigned Opc =getNamedBarrierOp(BarValImm.has_value(), IntrID);
6028 MIB =BuildMI(*MBB, &I,DL, TII.get(Opc));
6029
6030if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
6031auto DstReg =I.getOperand(0).getReg();
6032constTargetRegisterClass *DstRC =
6033TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
6034if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
6035returnfalse;
6036 MIB.addDef(DstReg);
6037 }
6038
6039if (BarValImm) {
6040auto BarId = ((*BarValImm) >> 4) & 0x3F;
6041 MIB.addImm(BarId);
6042 }
6043
6044I.eraseFromParent();
6045returntrue;
6046}
6047
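// The render* callbacks below convert matched operands into the immediate
// operands expected by the selected machine instructions.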
6048void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
6049constMachineInstr &MI,
6050int OpIdx) const{
6051assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6052"Expected G_CONSTANT");
6053 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
6054}
6055
6056void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
6057constMachineInstr &MI,
6058int OpIdx) const{
6059assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6060"Expected G_CONSTANT");
6061 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
6062}
6063
6064void AMDGPUInstructionSelector::renderBitcastFPImm(MachineInstrBuilder &MIB,
6065constMachineInstr &MI,
6066int OpIdx) const{
6067constMachineOperand &Op =MI.getOperand(1);
6068assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
6069 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
6070}
6071
6072void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
6073constMachineInstr &MI,
6074int OpIdx) const{
6075assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6076"Expected G_CONSTANT");
6077 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
6078}
6079
6080/// This only really exists to satisfy DAG type checking machinery, so is a
6081/// no-op here.
6082void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
6083constMachineInstr &MI,
6084int OpIdx) const{
6085constMachineOperand &Op =MI.getOperand(OpIdx);
6086 int64_tImm;
6087if (Op.isReg() &&mi_match(Op.getReg(), *MRI,m_ICst(Imm)))
6088 MIB.addImm(Imm);
6089else
6090 MIB.addImm(Op.getImm());
6091}
6092
6093void AMDGPUInstructionSelector::renderZextBoolTImm(MachineInstrBuilder &MIB,
6094constMachineInstr &MI,
6095int OpIdx) const{
6096 MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);
6097}
6098
6099void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
6100constMachineInstr &MI,
6101int OpIdx) const{
6102assert(OpIdx >= 0 &&"expected to match an immediate operand");
6103 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6104}
6105
6106void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
6107MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6108assert(OpIdx >= 0 &&"expected to match an immediate operand");
6109 MIB.addImm(
6110 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6111}
6112
6113void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
6114MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6115assert(OpIdx >= 0 &&"expected to match an immediate operand");
6116 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)
6117 ? (int64_t)(SISrcMods::OP_SEL_0 |SISrcMods::DST_OP_SEL)
6118 : (int64_t)SISrcMods::DST_OP_SEL);
6119}
6120
6121void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
6122MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6123assert(OpIdx >= 0 &&"expected to match an immediate operand");
6124 MIB.addImm(
6125 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6126}
6127
6128void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
6129MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6130assert(OpIdx >= 0 &&"expected to match an immediate operand");
6131 MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)
6132 ? (int64_t)(SISrcMods::OP_SEL_0)
6133 : 0);
6134}
6135
6136void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
6137MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6138assert(OpIdx >= 0 &&"expected to match an immediate operand");
6139 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::DST_OP_SEL)
6140 : 0);
6141}
6142
6143void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
6144MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6145assert(OpIdx >= 0 &&"expected to match an immediate operand");
6146 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)(SISrcMods::OP_SEL_0)
6147 : 0);
6148}
6149
6150void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
6151MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6152assert(OpIdx >= 0 &&"expected to match an immediate operand");
6153 MIB.addImm(
6154 (MI.getOperand(OpIdx).getImm() & 0x1) ? (int64_t)SISrcMods::OP_SEL_0 : 0);
6155}
6156
6157void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
6158MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6159assert(OpIdx >= 0 &&"expected to match an immediate operand");
6160 MIB.addImm(
6161 (MI.getOperand(OpIdx).getImm() & 0x2) ? (int64_t)SISrcMods::DST_OP_SEL : 0);
6162}
6163
6164void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
6165constMachineInstr &MI,
6166int OpIdx) const{
6167assert(OpIdx >= 0 &&"expected to match an immediate operand");
6168 MIB.addImm(MI.getOperand(OpIdx).getImm() &
6169 (AMDGPU::isGFX12Plus(STI) ?AMDGPU::CPol::ALL
6170 :AMDGPU::CPol::ALL_pregfx12));
6171}
6172
6173void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
6174constMachineInstr &MI,
6175int OpIdx) const{
6176assert(OpIdx >= 0 &&"expected to match an immediate operand");
6177constboolSwizzle =MI.getOperand(OpIdx).getImm() &
6178 (AMDGPU::isGFX12Plus(STI) ?AMDGPU::CPol::SWZ
6179 :AMDGPU::CPol::SWZ_pregfx12);
6180 MIB.addImm(Swizzle);
6181}
6182
6183void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
6184MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6185assert(OpIdx >= 0 &&"expected to match an immediate operand");
6186constuint32_t Cpol =MI.getOperand(OpIdx).getImm() &
6187 (AMDGPU::isGFX12Plus(STI) ?AMDGPU::CPol::ALL
6188 :AMDGPU::CPol::ALL_pregfx12);
6189 MIB.addImm(Cpol |AMDGPU::CPol::GLC);
6190}
6191
6192void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
6193constMachineInstr &MI,
6194int OpIdx) const{
6195 MIB.addFrameIndex(MI.getOperand(1).getIndex());
6196}
6197
6198void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
6199constMachineInstr &MI,
6200int OpIdx) const{
6201constAPFloat &APF =MI.getOperand(1).getFPImm()->getValueAPF();
6202int ExpVal = APF.getExactLog2Abs();
6203assert(ExpVal != INT_MIN);
6204 MIB.addImm(ExpVal);
6205}
6206
6207void AMDGPUInstructionSelector::renderRoundMode(MachineInstrBuilder &MIB,
6208constMachineInstr &MI,
6209int OpIdx) const{
6210// "round.towardzero" -> TowardZero 0 -> FP_ROUND_ROUND_TO_ZERO 3
6211// "round.tonearest" -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
6212// "round.upward" -> TowardPositive 2 -> FP_ROUND_ROUND_TO_INF 1
6213// "round.downward -> TowardNegative 3 -> FP_ROUND_ROUND_TO_NEGINF 2
6214 MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
6215}
6216
6217/// Convert from a 2-bit value to the enum values used for op_sel* source modifiers.
6218void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
6219MachineInstrBuilder &MIB,constMachineInstr &MI,int OpIdx) const{
6220unsigned Val =MI.getOperand(OpIdx).getImm();
6221unsignedNew = 0;
6222if (Val & 0x1)
6223New |=SISrcMods::OP_SEL_0;
6224if (Val & 0x2)
6225New |=SISrcMods::OP_SEL_1;
6226 MIB.addImm(New);
6227}
6228
6229bool AMDGPUInstructionSelector::isInlineImmediate(constAPInt &Imm) const{
6230return TII.isInlineConstant(Imm);
6231}
6232
6233bool AMDGPUInstructionSelector::isInlineImmediate(constAPFloat &Imm) const{
6234return TII.isInlineConstant(Imm);
6235}
Definition:Constants.h:157
llvm::DWARFExpression::Operation
This class represents an Operation in the Expression.
Definition:DWARFExpression.h:32
llvm::DebugLoc
A debug info location.
Definition:DebugLoc.h:33
llvm::DiagnosticInfoUnsupported
Diagnostic information for unsupported feature in backend.
Definition:DiagnosticInfo.h:1097
llvm::Function
Definition:Function.h:63
llvm::GBuildVector
Represents a G_BUILD_VECTOR.
Definition:GenericMachineInstrs.h:300
llvm::GCNSubtarget
Definition:GCNSubtarget.h:34
llvm::GCNSubtarget::useVGPRIndexMode
bool useVGPRIndexMode() const
Definition:GCNSubtarget.cpp:358
llvm::GCNSubtarget::hasPermlane32Swap
bool hasPermlane32Swap() const
Definition:GCNSubtarget.h:1341
llvm::GCNSubtarget::hasScalarCompareEq64
bool hasScalarCompareEq64() const
Definition:GCNSubtarget.h:1021
llvm::GCNSubtarget::getLDSBankCount
int getLDSBankCount() const
Definition:GCNSubtarget.h:350
llvm::GCNSubtarget::hasUsableDSOffset
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition:GCNSubtarget.h:478
llvm::GCNSubtarget::unsafeDSOffsetFoldingEnabled
bool unsafeDSOffsetFoldingEnabled() const
Definition:GCNSubtarget.h:482
llvm::GCNSubtarget::hasBitOp3Insts
bool hasBitOp3Insts() const
Definition:GCNSubtarget.h:1338
llvm::GCNSubtarget::hasFlatInstOffsets
bool hasFlatInstOffsets() const
Definition:GCNSubtarget.h:641
llvm::GCNSubtarget::hasCompressedExport
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
Definition:GCNSubtarget.h:1283
llvm::GCNSubtarget::hasGFX90AInsts
bool hasGFX90AInsts() const
Definition:GCNSubtarget.h:1247
llvm::GCNSubtarget::hasLDSLoadB96_B128
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
Definition:GCNSubtarget.h:1311
llvm::GCNSubtarget::getConstantBusLimit
unsigned getConstantBusLimit(unsigned Opcode) const
Definition:GCNSubtarget.cpp:200
llvm::GCNSubtarget::hasMADIntraFwdBug
bool hasMADIntraFwdBug() const
Definition:GCNSubtarget.h:1103
llvm::GCNSubtarget::privateMemoryResourceIsRangeChecked
bool privateMemoryResourceIsRangeChecked() const
Definition:GCNSubtarget.h:563
llvm::GCNSubtarget::hasSignedScratchOffsets
bool hasSignedScratchOffsets() const
Definition:GCNSubtarget.h:1442
llvm::GCNSubtarget::hasRestrictedSOffset
bool hasRestrictedSOffset() const
Definition:GCNSubtarget.h:1319
llvm::GCNSubtarget::getTargetLowering
const SITargetLowering * getTargetLowering() const override
Definition:GCNSubtarget.h:287
llvm::GCNSubtarget::ldsRequiresM0Init
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
Definition:GCNSubtarget.h:716
llvm::GCNSubtarget::hasSPackHL
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
Definition:GCNSubtarget.h:1279
llvm::GCNSubtarget::hasG16
bool hasG16() const
Definition:GCNSubtarget.h:1093
llvm::GCNSubtarget::hasPermlane16Swap
bool hasPermlane16Swap() const
Definition:GCNSubtarget.h:1340
llvm::GCNSubtarget::hasFlatScratchSVSSwizzleBug
bool hasFlatScratchSVSSwizzleBug() const
Definition:GCNSubtarget.h:1293
llvm::GCNSubtarget::hasGWS
bool hasGWS() const
Definition:GCNSubtarget.h:1401
llvm::GCNSubtarget::useFlatForGlobal
bool useFlatForGlobal() const
Definition:GCNSubtarget.h:541
llvm::GCNSubtarget::getGeneration
Generation getGeneration() const
Definition:GCNSubtarget.h:327
llvm::GCNSubtarget::hasSplitBarriers
bool hasSplitBarriers() const
Definition:GCNSubtarget.h:1416
llvm::GCNSubtarget::hasUnpackedD16VMem
bool hasUnpackedD16VMem() const
Definition:GCNSubtarget.h:746
llvm::GCNSubtarget::hasGWSSemaReleaseAll
bool hasGWSSemaReleaseAll() const
Definition:GCNSubtarget.h:730
llvm::GCNSubtarget::hasAddr64
bool hasAddr64() const
Definition:GCNSubtarget.h:391
llvm::GCNSubtarget::isWave64
bool isWave64() const
Definition:GCNSubtarget.h:1588
llvm::GCNSubtarget::hasAddNoCarry
bool hasAddNoCarry() const
Definition:GCNSubtarget.h:738
llvm::GCNSubtarget::hasSALUFloatInsts
bool hasSALUFloatInsts() const
Definition:GCNSubtarget.h:1315
llvm::GCNSubtarget::hasPartialNSAEncoding
bool hasPartialNSAEncoding() const
Definition:GCNSubtarget.h:1113
llvm::GCNSubtarget::checkSubtargetFeatures
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
Definition:GCNSubtarget.cpp:161
llvm::GConcatVectors
Represents a G_CONCAT_VECTORS.
Definition:GenericMachineInstrs.h:292
llvm::GIMatchTableExecutor::BFI
BlockFrequencyInfo * BFI
Definition:GIMatchTableExecutor.h:594
llvm::GIMatchTableExecutor::KB
GISelKnownBits * KB
Definition:GIMatchTableExecutor.h:591
llvm::GIMatchTableExecutor::MF
MachineFunction * MF
Definition:GIMatchTableExecutor.h:592
llvm::GIMatchTableExecutor::ComplexRendererFns
std::optional< SmallVector< std::function< void(MachineInstrBuilder &)>, 4 > > ComplexRendererFns
Definition:GIMatchTableExecutor.h:616
llvm::GIMatchTableExecutor::PSI
ProfileSummaryInfo * PSI
Definition:GIMatchTableExecutor.h:593
llvm::GIMatchTableExecutor::setupMF
virtual void setupMF(MachineFunction &mf, GISelKnownBits *kb, CodeGenCoverage *covinfo=nullptr, ProfileSummaryInfo *psi=nullptr, BlockFrequencyInfo *bfi=nullptr)
Setup per-MF executor state.
Definition:GIMatchTableExecutor.h:601
llvm::GIMatchTableExecutor::CoverageInfo
CodeGenCoverage * CoverageInfo
Definition:GIMatchTableExecutor.h:590
llvm::GISelKnownBits
Definition:GISelKnownBits.h:29
llvm::GISelKnownBits::getKnownOnes
APInt getKnownOnes(Register R)
Definition:GISelKnownBits.cpp:97
llvm::GISelKnownBits::getKnownBits
KnownBits getKnownBits(Register R)
Definition:GISelKnownBits.cpp:66
llvm::GISelKnownBits::signBitIsZero
bool signBitIsZero(Register Op)
Definition:GISelKnownBits.cpp:87
llvm::GISelKnownBits::getKnownZeroes
APInt getKnownZeroes(Register R)
Definition:GISelKnownBits.cpp:93
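
A hedged sketch of how known-bits queries are typically used before folding a value into a fixed-width immediate field; the 13-bit width and the helper name are illustrative assumptions, not code from this file:

  // Only fold when the known bits prove the value already fits in an
  // unsigned 13-bit field (every higher bit is known to be zero).
  bool fitsInUnsigned13Bits(llvm::GISelKnownBits &KB, llvm::Register Reg) {
    llvm::KnownBits Known = KB.getKnownBits(Reg);
    unsigned BitWidth = Known.getBitWidth();
    unsigned HighBits = BitWidth > 13 ? BitWidth - 13 : 0;
    return Known.Zero.countl_one() >= HighBits;
  }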
llvm::GlobalValue
Definition:GlobalValue.h:48
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition:GlobalValue.h:657
llvm::Instruction
Definition:Instruction.h:68
llvm::LLT
Definition:LowLevelType.h:39
llvm::LLT::isScalar
constexpr bool isScalar() const
Definition:LowLevelType.h:146
llvm::LLT::scalar
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition:LowLevelType.h:42
llvm::LLT::isValid
constexpr bool isValid() const
Definition:LowLevelType.h:145
llvm::LLT::isVector
constexpr bool isVector() const
Definition:LowLevelType.h:148
llvm::LLT::getSizeInBits
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition:LowLevelType.h:190
llvm::LLT::getElementType
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
Definition:LowLevelType.h:277
llvm::LLT::getAddressSpace
constexpr unsigned getAddressSpace() const
Definition:LowLevelType.h:270
llvm::LLT::fixed_vector
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
Definition:LowLevelType.h:100
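
For readers new to GlobalISel's low-level types, a small sketch of constructing and inspecting LLTs with the calls listed above (the include path is the usual one in recent LLVM, stated here as an assumption):

  #include "llvm/CodeGenTypes/LowLevelType.h"
  using namespace llvm;

  void lltExample() {
    LLT S32 = LLT::scalar(32);             // 32-bit scalar
    LLT V2S16 = LLT::fixed_vector(2, 16);  // <2 x s16>
    bool IsVec = V2S16.isVector();         // true
    LLT Elt = V2S16.getElementType();      // s16
    unsigned Bits = S32.getSizeInBits();   // 32
    (void)IsVec; (void)Elt; (void)Bits;
  }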
llvm::MCInstrDesc
Describe properties that are true of each instruction in the target description file.
Definition:MCInstrDesc.h:198
llvm::MDNode
Metadata node.
Definition:Metadata.h:1073
llvm::MachineBasicBlock
Definition:MachineBasicBlock.h:125
llvm::MachineBasicBlock::getParent
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
Definition:MachineBasicBlock.h:311
llvm::MachineFrameInfo
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
Definition:MachineFrameInfo.h:106
llvm::MachineFrameInfo::setReturnAddressIsTaken
void setReturnAddressIsTaken(bool s)
Definition:MachineFrameInfo.h:380
llvm::MachineFunction
Definition:MachineFunction.h:267
llvm::MachineFunction::getSubtarget
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Definition:MachineFunction.h:733
llvm::MachineFunction::getMachineMemOperand
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
Definition:MachineFunction.cpp:536
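
As a hedged fragment (usual LLVM headers and a MachineFunction in scope are assumed), allocating a memory operand that describes an untyped 32-bit load from AMDGPU private memory might look like:

  llvm::MachineMemOperand *makeScratchLoadMMO(llvm::MachineFunction &MF) {
    llvm::MachinePointerInfo PtrInfo(llvm::AMDGPUAS::PRIVATE_ADDRESS);
    return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOLoad,
                                   llvm::LLT::scalar(32), llvm::Align(4));
  }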
llvm::MachineFunction::getFrameInfo
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
Definition:MachineFunction.h:749
llvm::MachineFunction::getRegInfo
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Definition:MachineFunction.h:743
llvm::MachineFunction::getFunction
Function & getFunction()
Return the LLVM function that this machine code represents.
Definition:MachineFunction.h:704
llvm::MachineFunction::getInfo
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Definition:MachineFunction.h:831
llvm::MachineFunction::getTarget
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Definition:MachineFunction.h:729
llvm::MachineIRBuilder
Helper class to build MachineInstr.
Definition:MachineIRBuilder.h:235
llvm::MachineInstrBuilder
Definition:MachineInstrBuilder.h:71
llvm::MachineInstrBuilder::setMemRefs
const MachineInstrBuilder & setMemRefs(ArrayRef< MachineMemOperand * > MMOs) const
Definition:MachineInstrBuilder.h:210
llvm::MachineInstrBuilder::setOperandDead
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
Definition:MachineInstrBuilder.h:285
llvm::MachineInstrBuilder::addImm
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
Definition:MachineInstrBuilder.h:133
llvm::MachineInstrBuilder::add
const MachineInstrBuilder & add(const MachineOperand &MO) const
Definition:MachineInstrBuilder.h:226
llvm::MachineInstrBuilder::addFrameIndex
const MachineInstrBuilder & addFrameIndex(int Idx) const
Definition:MachineInstrBuilder.h:154
llvm::MachineInstrBuilder::addGlobalAddress
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
Definition:MachineInstrBuilder.h:179
llvm::MachineInstrBuilder::addReg
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Definition:MachineInstrBuilder.h:99
llvm::MachineInstrBuilder::addMBB
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Definition:MachineInstrBuilder.h:148
llvm::MachineInstrBuilder::cloneMemRefs
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Definition:MachineInstrBuilder.h:215
llvm::MachineInstrBuilder::addUse
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
Definition:MachineInstrBuilder.h:125
llvm::MachineInstrBuilder::addDef
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
Definition:MachineInstrBuilder.h:118
llvm::MachineInstr
Representation of each machine instruction.
Definition:MachineInstr.h:71
llvm::MachineInstr::getOpcode
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition:MachineInstr.h:577
llvm::MachineInstr::getParent
const MachineBasicBlock * getParent() const
Definition:MachineInstr.h:349
llvm::MachineInstr::getNumOperands
unsigned getNumOperands() const
Returns the total number of operands.
Definition:MachineInstr.h:580
llvm::MachineInstr::tieOperands
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
Definition:MachineInstr.cpp:1168
llvm::MachineInstr::NoUWrap
@ NoUWrap
Definition:MachineInstr.h:107
llvm::MachineInstr::getDebugLoc
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition:MachineInstr.h:501
llvm::MachineInstr::getOperand
const MachineOperand & getOperand(unsigned i) const
Definition:MachineInstr.h:587
llvm::MachineMemOperand
A description of a memory reference used in the backend.
Definition:MachineMemOperand.h:129
llvm::MachineMemOperand::getAddrSpace
unsigned getAddrSpace() const
Definition:MachineMemOperand.h:233
llvm::MachineMemOperand::MOLoad
@ MOLoad
The memory access reads data.
Definition:MachineMemOperand.h:136
llvm::MachineMemOperand::MOStore
@ MOStore
The memory access writes data.
Definition:MachineMemOperand.h:138
llvm::MachineMemOperand::getPointerInfo
const MachinePointerInfo & getPointerInfo() const
Definition:MachineMemOperand.h:204
llvm::MachineMemOperand::getFlags
Flags getFlags() const
Return the raw flags of the source value,.
Definition:MachineMemOperand.h:224
llvm::MachineMemOperand::getValue
const Value * getValue() const
Return the base address of the memory access.
Definition:MachineMemOperand.h:213
llvm::MachineMemOperand::getBaseAlign
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
Definition:MachineMemOperand.h:263
llvm::MachineOperand
MachineOperand class - Representation of each machine instruction operand.
Definition:MachineOperand.h:48
llvm::MachineOperand::getSubReg
unsigned getSubReg() const
Definition:MachineOperand.h:374
llvm::MachineOperand::isUndef
bool isUndef() const
Definition:MachineOperand.h:404
llvm::MachineOperand::getCImm
const ConstantInt * getCImm() const
Definition:MachineOperand.h:561
llvm::MachineOperand::setImm
void setImm(int64_t immVal)
Definition:MachineOperand.h:685
llvm::MachineOperand::getImm
int64_t getImm() const
Definition:MachineOperand.h:556
llvm::MachineOperand::isImplicit
bool isImplicit() const
Definition:MachineOperand.h:389
llvm::MachineOperand::isKill
bool isKill() const
Definition:MachineOperand.h:399
llvm::MachineOperand::isReg
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Definition:MachineOperand.h:329
llvm::MachineOperand::getShuffleMask
ArrayRef< int > getShuffleMask() const
Definition:MachineOperand.h:622
llvm::MachineOperand::setReg
void setReg(Register Reg)
Change the register this operand corresponds to.
Definition:MachineOperand.cpp:61
llvm::MachineOperand::isDef
bool isDef() const
Definition:MachineOperand.h:384
llvm::MachineOperand::isImm
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Definition:MachineOperand.h:331
llvm::MachineOperand::getParent
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
Definition:MachineOperand.h:243
llvm::MachineOperand::isDebug
bool isDebug() const
Definition:MachineOperand.h:455
llvm::MachineOperand::getIndex
int getIndex() const
Definition:MachineOperand.h:576
llvm::MachineOperand::isDead
bool isDead() const
Definition:MachineOperand.h:394
llvm::MachineOperand::CreateImm
static MachineOperand CreateImm(int64_t Val)
Definition:MachineOperand.h:820
llvm::MachineOperand::isEarlyClobber
bool isEarlyClobber() const
Definition:MachineOperand.h:445
llvm::MachineOperand::getReg
Register getReg() const
getReg - Returns the register number.
Definition:MachineOperand.h:369
llvm::MachineOperand::isInternalRead
bool isInternalRead() const
Definition:MachineOperand.h:440
llvm::MachineOperand::CreateReg
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
Definition:MachineOperand.h:838
llvm::MachineRegisterInfo
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Definition:MachineRegisterInfo.h:51
llvm::Metadata
Root of the metadata hierarchy.
Definition:Metadata.h:62
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition:Module.h:65
llvm::PointerUnion< const TargetRegisterClass *, const RegisterBank * >
llvm::ProfileSummaryInfo
Analysis providing profile information.
Definition:ProfileSummaryInfo.h:41
llvm::RegisterBankInfo::constrainGenericRegister
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
Definition:RegisterBankInfo.cpp:132
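
A minimal sketch of constraining a generic virtual register with this helper; the choice of AMDGPU::SGPR_32RegClass is an illustrative assumption:

  // Returns false if the register cannot live in the requested class,
  // mirroring how a selector would bail out of the current pattern.
  bool constrainToSGPR32(llvm::Register Reg, llvm::MachineRegisterInfo &MRI) {
    return llvm::RegisterBankInfo::constrainGenericRegister(
               Reg, llvm::AMDGPU::SGPR_32RegClass, MRI) != nullptr;
  }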
llvm::RegisterBankInfo::getRegBank
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
Definition:RegisterBankInfo.h:440
llvm::RegisterBankInfo::getSizeInBits
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
Definition:RegisterBankInfo.cpp:498
llvm::RegisterBank
This class implements the register bank concept.
Definition:RegisterBank.h:28
llvm::RegisterBank::getID
unsigned getID() const
Get the identifier of this register bank.
Definition:RegisterBank.h:45
llvm::Register
Wrapper class representing virtual and physical registers.
Definition:Register.h:19
llvm::SIInstrInfo
Definition:SIInstrInfo.h:85
llvm::SIInstrInfo::isLegalMUBUFImmOffset
bool isLegalMUBUFImmOffset(unsigned Imm) const
Definition:SIInstrInfo.cpp:9165
llvm::SIInstrInfo::isInlineConstant
bool isInlineConstant(const APInt &Imm) const
Definition:SIInstrInfo.cpp:4282
llvm::SIInstrInfo::getIndirectRegWriteMovRelPseudo
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
Definition:SIInstrInfo.cpp:1537
llvm::SIInstrInfo::getMaxMUBUFImmOffset
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Definition:SIInstrInfo.cpp:9169
llvm::SIInstrInfo::getIndirectGPRIDXPseudo
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
Definition:SIInstrInfo.cpp:1404
llvm::SIInstrInfo::splitFlatOffset
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
Definition:SIInstrInfo.cpp:9309
llvm::SIInstrInfo::getDSShaderTypeValue
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
Definition:SIInstrInfo.cpp:9879
llvm::SIInstrInfo::isLegalFLATOffset
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
Definition:SIInstrInfo.cpp:9285
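
A hedged sketch of the split-then-verify pattern these two helpers enable when folding a flat-global immediate offset; the function name and the assert are assumptions for illustration:

  #include <cassert>
  #include <utility>

  std::pair<int64_t, int64_t>
  splitGlobalOffset(const llvm::SIInstrInfo &TII, int64_t Offset) {
    // Split into {encodable immediate, remainder to materialize separately}.
    auto [ImmOffset, Remainder] =
        TII.splitFlatOffset(Offset, llvm::AMDGPUAS::GLOBAL_ADDRESS,
                            llvm::SIInstrFlags::FlatGlobal);
    assert(TII.isLegalFLATOffset(ImmOffset, llvm::AMDGPUAS::GLOBAL_ADDRESS,
                                 llvm::SIInstrFlags::FlatGlobal));
    return {ImmOffset, Remainder};
  }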
llvm::SIInstrInfo::MO_ABS32_LO
@ MO_ABS32_LO
Definition:SIInstrInfo.h:223
llvm::SIInstrInfo::enforceOperandRCAlignment
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
Definition:SIInstrInfo.cpp:10111
llvm::SIMachineFunctionInfo
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
Definition:SIMachineFunctionInfo.h:390
llvm::SIRegisterInfo
Definition:SIRegisterInfo.h:32
llvm::SIRegisterInfo::getSubRegFromChannel
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
Definition:SIRegisterInfo.cpp:549
llvm::SIRegisterInfo::getReturnAddressReg
MCRegister getReturnAddressReg(const MachineFunction &MF) const
Definition:SIRegisterInfo.cpp:3679
llvm::SIRegisterInfo::getRegSplitParts
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
Definition:SIRegisterInfo.cpp:3578
llvm::SIRegisterInfo::getRegClassForSizeOnBank
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
Definition:SIRegisterInfo.cpp:3685
llvm::SIRegisterInfo::getConstrainedRegClassForOperand
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
Definition:SIRegisterInfo.cpp:3704
llvm::SIRegisterInfo::getRegClassForTypeOnBank
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
Definition:SIRegisterInfo.h:345
llvm::SIRegisterInfo::getBoolRC
const TargetRegisterClass * getBoolRC() const
Definition:SIRegisterInfo.h:353
llvm::SIRegisterInfo::getExec
MCRegister getExec() const
Definition:SIRegisterInfo.cpp:3720
llvm::SIRegisterInfo::getWaveMaskRegClass
const TargetRegisterClass * getWaveMaskRegClass() const
Definition:SIRegisterInfo.h:358
llvm::SIRegisterInfo::isSGPRClass
static bool isSGPRClass(const TargetRegisterClass *RC)
Definition:SIRegisterInfo.h:203
llvm::SmallVectorBase::empty
bool empty() const
Definition:SmallVector.h:81
llvm::SmallVectorBase::size
size_t size() const
Definition:SmallVector.h:78
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition:SmallVector.h:573
llvm::SmallVectorTemplateBase::push_back
void push_back(const T &Elt)
Definition:SmallVector.h:413
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition:SmallVector.h:1196
llvm::SrcOp
Definition:MachineIRBuilder.h:142
llvm::SrcOp::getReg
Register getReg() const
Definition:MachineIRBuilder.h:194
llvm::TargetLoweringBase::getStackPointerRegisterToSaveRestore
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
Definition:TargetLowering.h:2015
llvm::TargetMachine::getOptLevel
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
Definition:TargetMachine.h:257
llvm::TargetMachine::getTargetTriple
const Triple & getTargetTriple() const
Definition:TargetMachine.h:126
llvm::TargetRegisterClass
Definition:TargetRegisterInfo.h:44
llvm::TargetRegisterClass::getID
unsigned getID() const
Return the register class ID number.
Definition:TargetRegisterInfo.h:73
llvm::TargetRegisterClass::hasSubClassEq
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
Definition:TargetRegisterInfo.h:130
llvm::TargetRegisterClass::hasSuperClassEq
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
Definition:TargetRegisterInfo.h:142
llvm::TargetRegisterInfo
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Definition:TargetRegisterInfo.h:235
llvm::Triple::OSType
OSType
Definition:Triple.h:199
llvm::Triple::AMDHSA
@ AMDHSA
Definition:Triple.h:223
llvm::Triple::AMDPAL
@ AMDPAL
Definition:Triple.h:233
llvm::Triple::getOS
OSType getOS() const
Get the parsed operating system type of this triple.
Definition:Triple.h:404
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
llvm::Value
LLVM Value Representation.
Definition:Value.h:74
llvm::Value::Value
Value(Type *Ty, unsigned scid)
Definition:Value.cpp:53
uint32_t
uint64_t
uint8_t
unsigned
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition:ErrorHandling.h:143
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition:AMDGPUAddrSpace.h:38
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition:AMDGPUAddrSpace.h:32
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition:AMDGPUAddrSpace.h:35
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition:AMDGPUAddrSpace.h:31
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition:AMDGPUAddrSpace.h:36
llvm::AMDGPU::Barrier::WORKGROUP
@ WORKGROUP
Definition:SIDefines.h:1078
llvm::AMDGPU::CPol::CPol
CPol
Definition:SIDefines.h:380
llvm::AMDGPU::CPol::GLC
@ GLC
Definition:SIDefines.h:381
llvm::AMDGPU::CPol::SWZ_pregfx12
@ SWZ_pregfx12
Definition:SIDefines.h:389
llvm::AMDGPU::CPol::ALL
@ ALL
Definition:SIDefines.h:422
llvm::AMDGPU::CPol::SWZ
@ SWZ
Definition:SIDefines.h:420
llvm::AMDGPU::CPol::ALL_pregfx12
@ ALL_pregfx12
Definition:SIDefines.h:388
llvm::AMDGPU::CPol::VOLATILE
@ VOLATILE
Definition:SIDefines.h:432
llvm::AMDGPU::DPP::DPP_FI_1
@ DPP_FI_1
Definition:SIDefines.h:987
llvm::AMDGPU::DPP::DPP_FI_0
@ DPP_FI_0
Definition:SIDefines.h:986
llvm::AMDGPU::HSAMD::Kernel::Key::SymbolName
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
Definition:AMDGPUMetadata.h:387
llvm::AMDGPU::IsaInfo::TargetIDSetting::Off
@ Off
llvm::AMDGPU::PALMD::Key
Key
PAL metadata keys.
Definition:AMDGPUMetadata.h:487
llvm::AMDGPU::SDWA::UNUSED_PRESERVE
@ UNUSED_PRESERVE
Definition:SIDefines.h:916
llvm::AMDGPU::SDWA::WORD_1
@ WORD_1
Definition:SIDefines.h:909
llvm::AMDGPU::SDWA::WORD_0
@ WORD_0
Definition:SIDefines.h:908
llvm::AMDGPU::getMIMGG16MappingInfo
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
llvm::AMDGPU::getGlobalSaddrOp
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
llvm::AMDGPU::getMIMGOpcode
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
Definition:AMDGPUBaseInfo.cpp:273
llvm::AMDGPU::getSMRDEncodedLiteralOffset32
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
Definition:AMDGPUBaseInfo.cpp:2936
llvm::AMDGPU::isGFX12Plus
bool isGFX12Plus(const MCSubtargetInfo &STI)
Definition:AMDGPUBaseInfo.cpp:2210
llvm::AMDGPU::Imm
@ Imm
Definition:AMDGPURegBankLegalizeRules.h:105
llvm::AMDGPU::isInlinableLiteral32
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
Definition:AMDGPUBaseInfo.cpp:2616
llvm::AMDGPU::hasNamedOperand
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
Definition:AMDGPUBaseInfo.h:400
llvm::AMDGPU::hasSMRDSignedImmOffset
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
Definition:AMDGPUBaseInfo.cpp:163
llvm::AMDGPU::isGFX11Plus
bool isGFX11Plus(const MCSubtargetInfo &STI)
Definition:AMDGPUBaseInfo.cpp:2202
llvm::AMDGPU::isGFX10Plus
bool isGFX10Plus(const MCSubtargetInfo &STI)
Definition:AMDGPUBaseInfo.cpp:2194
llvm::AMDGPU::getSMRDEncodedOffset
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
Definition:AMDGPUBaseInfo.cpp:2907
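
A small hedged wrapper showing the intended use: the result is the value to place in the SMRD offset field if the byte offset is encodable, and std::nullopt if the caller must fall back to a register offset (the wrapper name is an assumption):

  std::optional<int64_t> tryEncodeSMRDOffset(const llvm::GCNSubtarget &ST,
                                             int64_t ByteOffset) {
    return llvm::AMDGPU::getSMRDEncodedOffset(ST, ByteOffset,
                                              /*IsBuffer=*/false,
                                              /*HasSOffset=*/false);
  }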
llvm::AMDGPU::getRegBitWidth
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
Definition:SIRegisterInfo.cpp:3201
llvm::AMDGPU::getMIMGDimInfo
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
llvm::AMDGPU::getMIMGBaseOpcodeInfo
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
llvm::AMDGPU::getIntrinsicID
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
Definition:AMDGPUInstrInfo.cpp:30
llvm::AMDGPU::getImageDimIntrinsicInfo
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
llvm::AMDGPU::getBaseWithConstantOffset
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
Definition:AMDGPUGlobalISelUtils.cpp:26
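
A hedged sketch of peeling a constant offset off an address before choosing an addressing mode; Reg and MRI stand in for values a selector would already have:

  void splitAddress(llvm::MachineRegisterInfo &MRI, llvm::Register Reg) {
    // Base keeps the variable part, Offset the folded constant part.
    auto [Base, Offset] = llvm::AMDGPU::getBaseWithConstantOffset(MRI, Reg);
    (void)Base; (void)Offset;
  }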
llvm::ARMII::IndexMode
IndexMode
ARM Index Modes.
Definition:ARMBaseInfo.h:177
llvm::ARMII::VecSize
@ VecSize
Definition:ARMBaseInfo.h:437
llvm::ARM::ProfileKind::M
@ M
llvm::BitmaskEnumDetail::Mask
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition:BitmaskEnum.h:125
llvm::Intrinsic::getOrInsertDeclaration
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition:Intrinsics.cpp:732
llvm::MIPatternMatch::m_Reg
operand_type_match m_Reg()
Definition:MIPatternMatch.h:270
llvm::MIPatternMatch::m_GCst
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
Definition:MIPatternMatch.h:151
llvm::MIPatternMatch::m_Copy
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
Definition:MIPatternMatch.h:682
llvm::MIPatternMatch::m_SpecificICst
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
Definition:MIPatternMatch.h:195
llvm::MIPatternMatch::m_GZExt
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
Definition:MIPatternMatch.h:633
llvm::MIPatternMatch::m_GXor
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
Definition:MIPatternMatch.h:552
llvm::MIPatternMatch::m_GFPExt
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
Definition:MIPatternMatch.h:638
llvm::MIPatternMatch::m_ICst
ConstantMatch< APInt > m_ICst(APInt &Cst)
Definition:MIPatternMatch.h:93
llvm::MIPatternMatch::m_AllOnesInt
SpecificConstantMatch m_AllOnesInt()
Definition:MIPatternMatch.h:239
llvm::MIPatternMatch::m_GOr
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
Definition:MIPatternMatch.h:557
llvm::MIPatternMatch::m_ICstOrSplat
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
Definition:MIPatternMatch.h:134
llvm::MIPatternMatch::mi_match
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
Definition:MIPatternMatch.h:25
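
A minimal sketch (not code from this file) of how mi_match composes the matchers above to recognize "G_PTR_ADD base, constant" in one call:

  bool matchPtrAddOfConstant(llvm::Register Ptr,
                             const llvm::MachineRegisterInfo &MRI,
                             llvm::Register &Base, llvm::APInt &Cst) {
    using namespace llvm::MIPatternMatch;
    // Binds Base to the first operand and Cst to the G_CONSTANT value.
    return mi_match(Ptr, MRI, m_GPtrAdd(m_Reg(Base), m_ICst(Cst)));
  }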
llvm::MIPatternMatch::m_GPtrAdd
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Definition:MIPatternMatch.h:510
llvm::MIPatternMatch::m_any_of
Or< Preds... > m_any_of(Preds &&... preds)
Definition:MIPatternMatch.h:314
llvm::MIPatternMatch::m_GAnd
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
Definition:MIPatternMatch.h:546
llvm::MIPatternMatch::m_GBitcast
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
Definition:MIPatternMatch.h:649
llvm::MIPatternMatch::m_GFNeg
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
Definition:MIPatternMatch.h:677
llvm::MIPatternMatch::m_GFCstOrSplat
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
Definition:MIPatternMatch.h:180
llvm::MIPatternMatch::m_GFabs
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
Definition:MIPatternMatch.h:672
llvm::MIPatternMatch::m_GLShr
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
Definition:MIPatternMatch.h:570
llvm::MIPatternMatch::m_GTrunc
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
Definition:MIPatternMatch.h:643
llvm::MipsISD::Ret
@ Ret
Definition:MipsISelLowering.h:117
llvm::NVPTXISD::BFE
@ BFE
Definition:NVPTXISelLowering.h:60
llvm::PatternMatch::m_ZeroInt
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition:PatternMatch.h:599
llvm::PatternMatch::m_OneUse
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition:PatternMatch.h:67
llvm::PatternMatch::m_Not
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
Definition:PatternMatch.h:2467
llvm::RegState::Implicit
@ Implicit
Not emitted register (e.g. carry, or temporary result).
Definition:MachineInstrBuilder.h:48
llvm::RegState::Dead
@ Dead
Unused definition.
Definition:MachineInstrBuilder.h:52
llvm::RegState::Kill
@ Kill
The last use of a register.
Definition:MachineInstrBuilder.h:50
llvm::SIInstrFlags::FlatGlobal
@ FlatGlobal
Definition:SIDefines.h:142
llvm::SIInstrFlags::DS
@ DS
Definition:SIDefines.h:88
llvm::SIInstrFlags::FlatScratch
@ FlatScratch
Definition:SIDefines.h:157
llvm::SIInstrFlags::FLAT
@ FLAT
Definition:SIDefines.h:87
llvm::SISrcMods::ABS
@ ABS
Definition:SIDefines.h:289
llvm::SISrcMods::OP_SEL_0
@ OP_SEL_0
Definition:SIDefines.h:292
llvm::SISrcMods::DST_OP_SEL
@ DST_OP_SEL
Definition:SIDefines.h:294
llvm::SISrcMods::NEG_HI
@ NEG_HI
Definition:SIDefines.h:291
llvm::SISrcMods::OP_SEL_1
@ OP_SEL_1
Definition:SIDefines.h:293
llvm::SISrcMods::NEG
@ NEG
Definition:SIDefines.h:288
llvm::SPII::Load
@ Load
Definition:SparcInstrInfo.h:32
llvm::X86Disassembler::Reg
Reg
All possible values of the reg field in the ModR/M byte.
Definition:X86DisassemblerDecoder.h:621
llvm::X86::FirstMacroFusionInstKind::Cmp
@ Cmp
llvm::codeview::EncodedFramePtrReg::BasePtr
@ BasePtr
llvm::ms_demangle::IntrinsicFunctionKind::New
@ New
llvm::omp::RTLDependInfoFields::Flags
@ Flags
llvm::orc::MemProt::Exec
@ Exec
llvm::rdf::Def
NodeAddr< DefNode * > Def
Definition:RDFGraph.h:384
llvm::sampleprof::Base
@ Base
Definition:Discriminator.h:58
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition:AddressRanges.h:18
llvm::getFunctionLiveInPhysReg
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition:Utils.cpp:910
llvm::Offset
@ Offset
Definition:DWP.cpp:480
llvm::constrainOperandRegClass
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
Definition:Utils.cpp:56
llvm::getOpcodeDef
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
Definition:Utils.cpp:645
llvm::popcount
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition:bit.h:385
llvm::getConstantFPVRegVal
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition:Utils.cpp:459
llvm::BuildMI
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
Definition:MachineInstrBuilder.h:373
llvm::getIConstantVRegVal
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
Definition:Utils.cpp:294
llvm::Depth
@ Depth
Definition:SIMachineScheduler.h:36
llvm::constrainSelectedInstRegOperands
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
Definition:Utils.cpp:155
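
A hedged sketch of the usual "build the target instruction, then constrain its operands" idiom that BuildMI and constrainSelectedInstRegOperands support; the opcode and helper name are placeholders, not taken from this file:

  bool emitSMov32(llvm::MachineBasicBlock &MBB,
                  llvm::MachineBasicBlock::iterator InsertPt,
                  const llvm::DebugLoc &DL, const llvm::SIInstrInfo &TII,
                  const llvm::SIRegisterInfo &TRI,
                  const llvm::RegisterBankInfo &RBI,
                  llvm::Register Dst, llvm::Register Src) {
    llvm::MachineInstr *MI =
        llvm::BuildMI(MBB, InsertPt, DL, TII.get(llvm::AMDGPU::S_MOV_B32), Dst)
            .addReg(Src);
    // Fails (returns false) if an operand cannot be constrained to the
    // register classes required by S_MOV_B32.
    return llvm::constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
  }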
llvm::getDefIgnoringCopies
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
Definition:Utils.cpp:486
llvm::getIConstantVRegSExtVal
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
Definition:Utils.cpp:314
llvm::Hi_32
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition:MathExtras.h:155
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition:Debug.cpp:163
llvm::report_fatal_error
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition:Error.cpp:167
llvm::getAnyConstantVRegValWithLookThrough
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
Definition:Utils.cpp:439
llvm::CodeGenOptLevel::None
@ None
-O0
llvm::Lo_32
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition:MathExtras.h:160
llvm::getUndefRegState
unsigned getUndefRegState(bool B)
Definition:MachineInstrBuilder.h:561
llvm::RecurKind::SMax
@ SMax
Signed integer max implemented in terms of select(cmp()).
llvm::RecurKind::Add
@ Add
Sum of integers.
llvm::Op
DWARFExpression::Operation Op
Definition:DWARFExpression.cpp:22
llvm::DS_Error
@ DS_Error
Definition:DiagnosticInfo.h:50
llvm::getIConstantVRegValWithLookThrough
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition:Utils.cpp:433
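
A short hedged example of folding a virtual register to an immediate with this helper; ValueAndVReg carries the constant as an APInt plus the vreg it was found on:

  #include <optional>

  std::optional<int64_t> foldToImm(llvm::Register Reg,
                                   const llvm::MachineRegisterInfo &MRI) {
    if (auto C = llvm::getIConstantVRegValWithLookThrough(Reg, MRI))
      return C->Value.getSExtValue();
    return std::nullopt;
  }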
llvm::getDefSrcRegIgnoringCopies
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Definition:Utils.cpp:467
llvm::getSrcRegIgnoringCopies
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
Definition:Utils.cpp:493
llvm::Data
@ Data
Definition:SIMachineScheduler.h:55
llvm::InstructionUniformity::Default
@ Default
The result values are uniform if and only if all operands are uniform.
llvm::AMDGPU::ImageDimIntrinsicInfo
Definition:AMDGPUInstrInfo.h:55
llvm::AMDGPU::MIMGBaseOpcodeInfo
Definition:AMDGPUBaseInfo.h:407
llvm::AMDGPU::MIMGBaseOpcodeInfo::Gather4
bool Gather4
Definition:AMDGPUBaseInfo.h:413
llvm::AMDGPU::MIMGBaseOpcodeInfo::AtomicX2
bool AtomicX2
Definition:AMDGPUBaseInfo.h:411
llvm::AMDGPU::MIMGBaseOpcodeInfo::Sampler
bool Sampler
Definition:AMDGPUBaseInfo.h:412
llvm::AMDGPU::MIMGBaseOpcodeInfo::NoReturn
bool NoReturn
Definition:AMDGPUBaseInfo.h:424
llvm::AMDGPU::MIMGBaseOpcodeInfo::HasD16
bool HasD16
Definition:AMDGPUBaseInfo.h:420
llvm::AMDGPU::MIMGBaseOpcodeInfo::Store
bool Store
Definition:AMDGPUBaseInfo.h:409
llvm::AMDGPU::MIMGBaseOpcodeInfo::Atomic
bool Atomic
Definition:AMDGPUBaseInfo.h:410
llvm::AMDGPU::MIMGDimInfo
Definition:AMDGPUBaseInfo.h:433
llvm::AMDGPU::MIMGDimInfo::Encoding
uint8_t Encoding
Definition:AMDGPUBaseInfo.h:439
llvm::AMDGPU::MIMGDimInfo::DA
bool DA
Definition:AMDGPUBaseInfo.h:438
llvm::AMDGPU::MIMGG16MappingInfo
Definition:AMDGPUBaseInfo.h:472
llvm::AMDGPU::MIMGG16MappingInfo::G16
MIMGBaseOpcode G16
Definition:AMDGPUBaseInfo.h:474
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition:Alignment.h:39
llvm::KnownBits
Definition:KnownBits.h:23
llvm::KnownBits::makeConstant
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition:KnownBits.h:293
llvm::KnownBits::add
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition:KnownBits.h:336
llvm::MIPatternMatch::And
Matching combinators.
Definition:MIPatternMatch.h:273
llvm::MIPatternMatch::Or
Definition:MIPatternMatch.h:292
llvm::MachinePointerInfo
This class contains a discriminated union of information about pointers in memory operands,...
Definition:MachineMemOperand.h:41
llvm::MachinePointerInfo::Offset
int64_t Offset
Offset - This is an offset from the base Value*.
Definition:MachineMemOperand.h:46
llvm::MachinePointerInfo::AddrSpace
unsigned AddrSpace
Definition:MachineMemOperand.h:48
llvm::MachinePointerInfo::V
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
Definition:MachineMemOperand.h:43
