//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// This file implements the targeting of the InstructionSelector class for
/// AMDGPU.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-isel"

using namespace llvm;
using namespace MIPatternMatch;

#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AMDGPUInstructionSelector::AMDGPUInstructionSelector(
    const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
    const AMDGPUTargetMachine &TM)
    : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AMDGPUGenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// Return the wave level SGPR base address if this is a wave address.
static Register getWaveAddress(const MachineInstr *Def) {
  return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
             ? Def->getOperand(1).getReg()
             : Register();
}
bool AMDGPUInstructionSelector::isVCC(Register Reg,
                                      const MachineRegisterInfo &MRI) const {
  // The verifier is oblivious to s1 being a valid value for wavesize registers.

  auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
  const TargetRegisterClass *RC =
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
  if (RC) {
    const LLT Ty = MRI.getType(Reg);
    // G_TRUNC s1 result is never vcc.
    return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
           RC->hasSuperClassEq(TRI.getBoolRC());
  }

  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
  return RB->getID() == AMDGPU::VCCRegBankID;
}
bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
                                                        unsigned NewOpc) const {
  MI.setDesc(TII.get(NewOpc));
  MI.removeOperand(1); // Remove intrinsic ID.

  // TODO: This should be legalized to s32 if needed

  if (!DstRC || DstRC != SrcRC)
    return false;
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
  I.setDesc(TII.get(TargetOpcode::COPY));

  if (isVCC(DstReg, *MRI)) {
    if (SrcReg == AMDGPU::SCC) {

    if (!isVCC(SrcReg, *MRI)) {
      // TODO: Should probably leave the copy and let copyPhysReg expand it.
      std::optional<ValueAndVReg> ConstVal =

          STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
          .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);

      Register MaskedReg = MRI->createVirtualRegister(SrcRC);

      // We can't trust the high bits at this point, so clear them.
      // TODO: Skip masking high bits if def is known boolean.
      const int64_t NoMods = 0;
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_AND_B16_t16_e64), MaskedReg)

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U16_t16_e64), DstReg)

      unsigned AndOpc = IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
      And.setOperandDead(3); // Dead scc
      BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)

      if (!MRI->getRegClassOrNull(SrcReg))
        MRI->setRegClass(SrcReg, SrcRC);
  if (MO.getReg().isPhysical())

bool AMDGPUInstructionSelector::selectCOPY_SCC_VCC(MachineInstr &I) const {
      STI.isWave64() ? AMDGPU::S_CMP_LG_U64 : AMDGPU::S_CMP_LG_U32;
      .addReg(I.getOperand(1).getReg())

  Register DstReg = I.getOperand(0).getReg();
bool AMDGPUInstructionSelector::selectCOPY_VCC_SCC(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  std::optional<ValueAndVReg> Arg =

  const int64_t Value = Arg->Value.getZExtValue();
  unsigned Opcode = STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;

  // RegBankLegalize ensures that SrcReg is bool in reg (high bits are 0).
  unsigned SelectOpcode =
      STI.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
bool AMDGPUInstructionSelector::selectReadAnyLane(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();

  auto RFL = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
  const Register DefReg = I.getOperand(0).getReg();
  const LLT DefTy = MRI->getType(DefReg);

  // S1 G_PHIs should not be selected in instruction-select, instead:
  // - divergent S1 G_PHI should go through lane mask merging algorithm
  //   and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
  // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect

  // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
      MRI->getRegClassOrRegBank(DefReg);
      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);

  const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);

  // If inputs have register bank, assign corresponding reg class.
  // Note: registers don't need to have the same reg bank.
  for (unsigned i = 1; i != I.getNumOperands(); i += 2) {
    const Register SrcReg = I.getOperand(i).getReg();
    const LLT SrcTy = MRI->getType(SrcReg);

  I.setDesc(TII.get(TargetOpcode::PHI));
                                                     unsigned SubIdx) const {
  Register DstReg = MRI->createVirtualRegister(&SubRC);

  unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
  BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
      .addReg(Reg, 0, ComposedSubIdx);

    return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
    return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
    return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
  Register DstReg = I.getOperand(0).getReg();

  if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
      DstRB->getID() != AMDGPU::VCCRegBankID)

  bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
400// Dead implicit-def of scc 408bool AMDGPUInstructionSelector::selectG_ADD_SUB(
MachineInstr &
I)
const{
411Register DstReg =
I.getOperand(0).getReg();
413LLT Ty =
MRI->getType(DstReg);
419constbool IsSALU = DstRB->
getID() == AMDGPU::SGPRRegBankID;
420constbool Sub =
I.getOpcode() == TargetOpcode::G_SUB;
424constunsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
427 .
add(
I.getOperand(1))
428 .
add(
I.getOperand(2))
435constunsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
436I.setDesc(TII.get(Opc));
442constunsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
448 .
add(
I.getOperand(1))
449 .
add(
I.getOperand(2))
455assert(!Sub &&
"illegal sub should not reach here");
458 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
460 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
462MachineOperand Lo1(getSubOperand64(
I.getOperand(1), HalfRC, AMDGPU::sub0));
463MachineOperand Lo2(getSubOperand64(
I.getOperand(2), HalfRC, AMDGPU::sub0));
464MachineOperand Hi1(getSubOperand64(
I.getOperand(1), HalfRC, AMDGPU::sub1));
465MachineOperand Hi2(getSubOperand64(
I.getOperand(2), HalfRC, AMDGPU::sub1));
467Register DstLo =
MRI->createVirtualRegister(&HalfRC);
468Register DstHi =
MRI->createVirtualRegister(&HalfRC);
471BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
474BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
480Register CarryReg =
MRI->createVirtualRegister(CarryRC);
481BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
497BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
511bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
516Register Dst0Reg =
I.getOperand(0).getReg();
517Register Dst1Reg =
I.getOperand(1).getReg();
518constbool IsAdd =
I.getOpcode() == AMDGPU::G_UADDO ||
519I.getOpcode() == AMDGPU::G_UADDE;
520constbool HasCarryIn =
I.getOpcode() == AMDGPU::G_UADDE ||
521I.getOpcode() == AMDGPU::G_USUBE;
523if (isVCC(Dst1Reg, *MRI)) {
525 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
526unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
527I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
533Register Src0Reg =
I.getOperand(2).getReg();
534Register Src1Reg =
I.getOperand(3).getReg();
537BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
538 .
addReg(
I.getOperand(4).getReg());
541unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
542unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
544auto CarryInst =
BuildMI(*BB, &
I,
DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
545 .
add(
I.getOperand(2))
546 .
add(
I.getOperand(3));
548if (
MRI->use_nodbg_empty(Dst1Reg)) {
551BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), Dst1Reg)
553if (!
MRI->getRegClassOrNull(Dst1Reg))
554MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
564 AMDGPU::SReg_32RegClass, *MRI))
571bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
575constbool IsUnsigned =
I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
579 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
580 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
582 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
583I.setDesc(TII.get(Opc));
585I.addImplicitDefUseOperands(*
MF);
589// TODO: We should probably legalize these to only using 32-bit results. 590bool AMDGPUInstructionSelector::selectG_EXTRACT(
MachineInstr &
I)
const{
592Register DstReg =
I.getOperand(0).getReg();
593Register SrcReg =
I.getOperand(1).getReg();
594LLT DstTy =
MRI->getType(DstReg);
595LLT SrcTy =
MRI->getType(SrcReg);
599// TODO: Should handle any multiple of 32 offset. 600unsignedOffset =
I.getOperand(2).getImm();
601if (
Offset % 32 != 0 || DstSize > 128)
604// 16-bit operations really use 32-bit registers. 605// FIXME: Probably should not allow 16-bit G_EXTRACT results. 621 SrcRC = TRI.getSubClassWithSubReg(SrcRC,
SubReg);
626 *SrcRC,
I.getOperand(1));
628BuildMI(*BB, &
I,
DL, TII.get(TargetOpcode::COPY), DstReg)
635bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(
MachineInstr &
MI)
const{
638LLT DstTy =
MRI->getType(DstReg);
639LLT SrcTy =
MRI->getType(
MI.getOperand(1).getReg());
655BuildMI(*BB, &
MI,
DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
656for (
intI = 0, E =
MI.getNumOperands() - 1;
I != E; ++
I) {
674bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(
MachineInstr &
MI)
const{
676constint NumDst =
MI.getNumOperands() - 1;
682LLT DstTy =
MRI->getType(DstReg0);
683LLT SrcTy =
MRI->getType(SrcReg);
695// Note we could have mixed SGPR and VGPR destination banks for an SGPR 696// source, and this relies on the fact that the same subregister indices are 699for (
intI = 0, E = NumDst;
I != E; ++
I) {
701BuildMI(*BB, &
MI,
DL, TII.get(TargetOpcode::COPY), Dst.getReg())
702 .
addReg(SrcReg, 0, SubRegs[
I]);
704// Make sure the subregister index is valid for the source register. 705 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[
I]);
719bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(
MachineInstr &
MI)
const{
720assert(
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
721MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
725LLT SrcTy =
MRI->getType(Src0);
728// BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE. 729if (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
730return selectG_MERGE_VALUES(
MI);
733// Selection logic below is for V2S16 only. 734// For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32. 737 (
MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
742if (DstBank->
getID() == AMDGPU::AGPRRegBankID)
746 DstBank->
getID() == AMDGPU::VGPRRegBankID);
747constbool IsVector = DstBank->
getID() == AMDGPU::VGPRRegBankID;
752// First, before trying TableGen patterns, check if both sources are 753// constants. In those cases, we can trivially compute the final constant 754// and emit a simple move. 760const int64_t K0 = ConstSrc0->Value.getSExtValue();
761const int64_t K1 = ConstSrc1->Value.getSExtValue();
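    // Illustrative example: a <2 x s16> build of the constants 1 and 2 folds
    // to the single 32-bit immediate 0x00020001, with element 0 in the low
    // half.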
780// Now try TableGen patterns. 784// TODO: This should probably be a combine somewhere 785// (build_vector $src0, undef) -> copy $src0 787if (Src1Def->
getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
788MI.setDesc(TII.get(AMDGPU::COPY));
791 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
796// TODO: Can be improved? 798Register TmpReg =
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
799auto MIB =
BuildMI(*BB,
MI,
DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
805 MIB =
BuildMI(*BB,
MI,
DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
819// With multiple uses of the shift, this will duplicate the shift and 820// increase register pressure. 822// (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16) 823// => (S_PACK_HH_B32_B16 $src0, $src1) 824// (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1) 825// => (S_PACK_HL_B32_B16 $src0, $src1) 826// (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16)) 827// => (S_PACK_LH_B32_B16 $src0, $src1) 828// (build_vector $src0, $src1) 829// => (S_PACK_LL_B32_B16 $src0, $src1) 837unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
838if (Shift0 && Shift1) {
839 Opc = AMDGPU::S_PACK_HH_B32_B16;
840MI.getOperand(1).setReg(ShiftSrc0);
841MI.getOperand(2).setReg(ShiftSrc1);
843 Opc = AMDGPU::S_PACK_LH_B32_B16;
844MI.getOperand(2).setReg(ShiftSrc1);
848if (ConstSrc1 && ConstSrc1->Value == 0) {
849// build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16 850auto MIB =
BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
859 Opc = AMDGPU::S_PACK_HL_B32_B16;
860MI.getOperand(1).setReg(ShiftSrc0);
864MI.setDesc(TII.get(Opc));
868bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(
MachineInstr &
I)
const{
871// FIXME: Interface for getConstrainedRegClassForOperand needs work. The 872// regbank check here is to know why getConstrainedRegClassForOperand failed. 874if ((!RC && !
MRI->getRegBankOrNull(MO.
getReg())) ||
876I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
883bool AMDGPUInstructionSelector::selectG_INSERT(
MachineInstr &
I)
const{
886Register DstReg =
I.getOperand(0).getReg();
887Register Src0Reg =
I.getOperand(1).getReg();
888Register Src1Reg =
I.getOperand(2).getReg();
889LLT Src1Ty =
MRI->getType(Src1Reg);
891unsigned DstSize =
MRI->getType(DstReg).getSizeInBits();
894 int64_t
Offset =
I.getOperand(3).getImm();
896// FIXME: These cases should have been illegal and unnecessary to check here. 897if (
Offset % 32 != 0 || InsSize % 32 != 0)
900// Currently not handled by getSubRegFromChannel. 905if (
SubReg == AMDGPU::NoSubRegister)
921// Deal with weird cases where the class only partially supports the subreg 923 Src0RC = TRI.getSubClassWithSubReg(Src0RC,
SubReg);
924if (!Src0RC || !Src1RC)
933BuildMI(*BB, &
I,
DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
942bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(
MachineInstr &
MI)
const{
949"scalar BFX instructions are expanded in regbankselect");
950assert(
MRI->getType(
MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
951"64-bit vector BFX instructions are expanded in regbankselect");
956bool IsSigned =
MI.getOpcode() == TargetOpcode::G_SBFX;
957unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
966bool AMDGPUInstructionSelector::selectInterpP1F16(
MachineInstr &
MI)
const{
978// This requires 2 instructions. It is possible to write a pattern to support 979// this, but the generated isel emitter doesn't correctly deal with multiple 980// output instructions using the same physical register input. The copy to m0 981// is incorrectly placed before the second instruction. 983// TODO: Match source modifiers. 985Register InterpMov =
MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
991BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
993 .
addImm(
MI.getOperand(4).getImm())
// $attr 994 .
addImm(
MI.getOperand(3).getImm());
// $attrchan 997 .
addImm(0)
// $src0_modifiers 999 .
addImm(
MI.getOperand(4).getImm())
// $attr 1000 .
addImm(
MI.getOperand(3).getImm())
// $attrchan 1001 .
addImm(0)
// $src2_modifiers 1002 .
addReg(InterpMov)
// $src2 - 2 f16 values selected by high 1003 .
addImm(
MI.getOperand(5).getImm())
// $high 1007MI.eraseFromParent();
// Writelane is special in that it can use SGPR and M0 (which would normally
// count as using the constant bus twice - but in this case it is allowed since
// the lane selector doesn't count as a use of the constant bus). However, it is
// still required to abide by the 1 SGPR rule. Fix this up if we might have
// multiple SGPR operands.
bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
1017// With a constant bus limit of at least 2, there's no issue. 1025Register LaneSelect =
MI.getOperand(3).getReg();
1028auto MIB =
BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
1030 std::optional<ValueAndVReg> ConstSelect =
1033// The selector has to be an inline immediate, so we can use whatever for 1034// the other operands. 1036 MIB.
addImm(ConstSelect->Value.getSExtValue() &
1039 std::optional<ValueAndVReg> ConstVal =
1042// If the value written is an inline immediate, we can get away without a 1046 MIB.
addImm(ConstVal->Value.getSExtValue());
1051// If the lane selector was originally in a VGPR and copied with 1052// readfirstlane, there's a hazard to read the same SGPR from the 1053// VALU. Constrain to a different SGPR to help avoid needing a nop later. 1056BuildMI(*
MBB, *MIB,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1064MI.eraseFromParent();
1068// We need to handle this here because tablegen doesn't support matching 1069// instructions with multiple outputs. 1070bool AMDGPUInstructionSelector::selectDivScale(
MachineInstr &
MI)
const{
1074LLT Ty =
MRI->getType(Dst0);
1077 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
1079 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
1083// TODO: Match source modifiers. 1090unsigned ChooseDenom =
MI.getOperand(5).getImm();
1092Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1096 .
addImm(0)
// $src0_modifiers 1098 .
addImm(0)
// $src1_modifiers 1100 .
addImm(0)
// $src2_modifiers 1105MI.eraseFromParent();
1109bool AMDGPUInstructionSelector::selectG_INTRINSIC(
MachineInstr &
I)
const{
1111switch (IntrinsicID) {
1112case Intrinsic::amdgcn_if_break: {
1115// FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1116// SelectionDAG uses for wave32 vs wave64. 1117BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1118 .
add(
I.getOperand(0))
1119 .
add(
I.getOperand(2))
1120 .
add(
I.getOperand(3));
1122Register DstReg =
I.getOperand(0).getReg();
1123Register Src0Reg =
I.getOperand(2).getReg();
1124Register Src1Reg =
I.getOperand(3).getReg();
1128for (
Register Reg : { DstReg, Src0Reg, Src1Reg })
1133case Intrinsic::amdgcn_interp_p1_f16:
1134return selectInterpP1F16(
I);
1135case Intrinsic::amdgcn_wqm:
1136return constrainCopyLikeIntrin(
I, AMDGPU::WQM);
1137case Intrinsic::amdgcn_softwqm:
1138return constrainCopyLikeIntrin(
I, AMDGPU::SOFT_WQM);
1139case Intrinsic::amdgcn_strict_wwm:
1140case Intrinsic::amdgcn_wwm:
1141return constrainCopyLikeIntrin(
I, AMDGPU::STRICT_WWM);
1142case Intrinsic::amdgcn_strict_wqm:
1143return constrainCopyLikeIntrin(
I, AMDGPU::STRICT_WQM);
1144case Intrinsic::amdgcn_writelane:
1145return selectWritelane(
I);
1146case Intrinsic::amdgcn_div_scale:
1147return selectDivScale(
I);
1148case Intrinsic::amdgcn_icmp:
1149case Intrinsic::amdgcn_fcmp:
1152return selectIntrinsicCmp(
I);
1153case Intrinsic::amdgcn_ballot:
1154return selectBallot(
I);
1155case Intrinsic::amdgcn_reloc_constant:
1156return selectRelocConstant(
I);
1157case Intrinsic::amdgcn_groupstaticsize:
1158return selectGroupStaticSize(
I);
1159case Intrinsic::returnaddress:
1160return selectReturnAddress(
I);
1161case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1162case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1163case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1164case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1165case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1166case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1167case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1168case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1169case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1170case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1171case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1172case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1173case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1174case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1175case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
1176case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
1177case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
1178case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
1179case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
1180case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
1181case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
1182case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
1183case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
1184case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
1185case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
1186case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
1187case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
1188case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
1189return selectSMFMACIntrin(
I);
1190case Intrinsic::amdgcn_permlane16_swap:
1191case Intrinsic::amdgcn_permlane32_swap:
1192return selectPermlaneSwapIntrin(
I, IntrinsicID);
1203if (
Size == 16 && !ST.has16BitInsts())
1206constautoSelect = [&](
unsigned S16Opc,
unsigned TrueS16Opc,
1207unsigned FakeS16Opc,
unsigned S32Opc,
1210// FIXME-TRUE16 use TrueS16Opc when realtrue16 is supported for CMP code 1211return ST.hasTrue16BitInsts()
1212 ? ST.useRealTrue16Insts() ? FakeS16Opc : FakeS16Opc
1223returnSelect(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1224 AMDGPU::V_CMP_NE_U16_fake16_e64, AMDGPU::V_CMP_NE_U32_e64,
1225 AMDGPU::V_CMP_NE_U64_e64);
1227returnSelect(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1228 AMDGPU::V_CMP_EQ_U16_fake16_e64, AMDGPU::V_CMP_EQ_U32_e64,
1229 AMDGPU::V_CMP_EQ_U64_e64);
1231returnSelect(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1232 AMDGPU::V_CMP_GT_I16_fake16_e64, AMDGPU::V_CMP_GT_I32_e64,
1233 AMDGPU::V_CMP_GT_I64_e64);
1235returnSelect(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1236 AMDGPU::V_CMP_GE_I16_fake16_e64, AMDGPU::V_CMP_GE_I32_e64,
1237 AMDGPU::V_CMP_GE_I64_e64);
1239returnSelect(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1240 AMDGPU::V_CMP_LT_I16_fake16_e64, AMDGPU::V_CMP_LT_I32_e64,
1241 AMDGPU::V_CMP_LT_I64_e64);
1243returnSelect(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1244 AMDGPU::V_CMP_LE_I16_fake16_e64, AMDGPU::V_CMP_LE_I32_e64,
1245 AMDGPU::V_CMP_LE_I64_e64);
1247returnSelect(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1248 AMDGPU::V_CMP_GT_U16_fake16_e64, AMDGPU::V_CMP_GT_U32_e64,
1249 AMDGPU::V_CMP_GT_U64_e64);
1251returnSelect(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1252 AMDGPU::V_CMP_GE_U16_fake16_e64, AMDGPU::V_CMP_GE_U32_e64,
1253 AMDGPU::V_CMP_GE_U64_e64);
1255returnSelect(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1256 AMDGPU::V_CMP_LT_U16_fake16_e64, AMDGPU::V_CMP_LT_U32_e64,
1257 AMDGPU::V_CMP_LT_U64_e64);
1259returnSelect(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1260 AMDGPU::V_CMP_LE_U16_fake16_e64, AMDGPU::V_CMP_LE_U32_e64,
1261 AMDGPU::V_CMP_LE_U64_e64);
1264returnSelect(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1265 AMDGPU::V_CMP_EQ_F16_fake16_e64, AMDGPU::V_CMP_EQ_F32_e64,
1266 AMDGPU::V_CMP_EQ_F64_e64);
1268returnSelect(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1269 AMDGPU::V_CMP_GT_F16_fake16_e64, AMDGPU::V_CMP_GT_F32_e64,
1270 AMDGPU::V_CMP_GT_F64_e64);
1272returnSelect(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1273 AMDGPU::V_CMP_GE_F16_fake16_e64, AMDGPU::V_CMP_GE_F32_e64,
1274 AMDGPU::V_CMP_GE_F64_e64);
1276returnSelect(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1277 AMDGPU::V_CMP_LT_F16_fake16_e64, AMDGPU::V_CMP_LT_F32_e64,
1278 AMDGPU::V_CMP_LT_F64_e64);
1280returnSelect(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1281 AMDGPU::V_CMP_LE_F16_fake16_e64, AMDGPU::V_CMP_LE_F32_e64,
1282 AMDGPU::V_CMP_LE_F64_e64);
1284returnSelect(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1285 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1286 AMDGPU::V_CMP_NEQ_F64_e64);
1288returnSelect(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1289 AMDGPU::V_CMP_O_F16_fake16_e64, AMDGPU::V_CMP_O_F32_e64,
1290 AMDGPU::V_CMP_O_F64_e64);
1292returnSelect(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1293 AMDGPU::V_CMP_U_F16_fake16_e64, AMDGPU::V_CMP_U_F32_e64,
1294 AMDGPU::V_CMP_U_F64_e64);
1296returnSelect(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1297 AMDGPU::V_CMP_NLG_F16_fake16_e64, AMDGPU::V_CMP_NLG_F32_e64,
1298 AMDGPU::V_CMP_NLG_F64_e64);
1300returnSelect(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1301 AMDGPU::V_CMP_NLE_F16_fake16_e64, AMDGPU::V_CMP_NLE_F32_e64,
1302 AMDGPU::V_CMP_NLE_F64_e64);
1304returnSelect(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1305 AMDGPU::V_CMP_NLT_F16_fake16_e64, AMDGPU::V_CMP_NLT_F32_e64,
1306 AMDGPU::V_CMP_NLT_F64_e64);
1308returnSelect(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1309 AMDGPU::V_CMP_NGE_F16_fake16_e64, AMDGPU::V_CMP_NGE_F32_e64,
1310 AMDGPU::V_CMP_NGE_F64_e64);
1312returnSelect(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1313 AMDGPU::V_CMP_NGT_F16_fake16_e64, AMDGPU::V_CMP_NGT_F32_e64,
1314 AMDGPU::V_CMP_NGT_F64_e64);
1316returnSelect(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1317 AMDGPU::V_CMP_NEQ_F16_fake16_e64, AMDGPU::V_CMP_NEQ_F32_e64,
1318 AMDGPU::V_CMP_NEQ_F64_e64);
1320returnSelect(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1321 AMDGPU::V_CMP_TRU_F16_fake16_e64, AMDGPU::V_CMP_TRU_F32_e64,
1322 AMDGPU::V_CMP_TRU_F64_e64);
1324returnSelect(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1325 AMDGPU::V_CMP_F_F16_fake16_e64, AMDGPU::V_CMP_F_F32_e64,
1326 AMDGPU::V_CMP_F_F64_e64);
1338return AMDGPU::S_CMP_LG_U64;
1340return AMDGPU::S_CMP_EQ_U64;
1349return AMDGPU::S_CMP_LG_U32;
1351return AMDGPU::S_CMP_EQ_U32;
1353return AMDGPU::S_CMP_GT_I32;
1355return AMDGPU::S_CMP_GE_I32;
1357return AMDGPU::S_CMP_LT_I32;
1359return AMDGPU::S_CMP_LE_I32;
1361return AMDGPU::S_CMP_GT_U32;
1363return AMDGPU::S_CMP_GE_U32;
1365return AMDGPU::S_CMP_LT_U32;
1367return AMDGPU::S_CMP_LE_U32;
1369return AMDGPU::S_CMP_EQ_F32;
1371return AMDGPU::S_CMP_GT_F32;
1373return AMDGPU::S_CMP_GE_F32;
1375return AMDGPU::S_CMP_LT_F32;
1377return AMDGPU::S_CMP_LE_F32;
1379return AMDGPU::S_CMP_LG_F32;
1381return AMDGPU::S_CMP_O_F32;
1383return AMDGPU::S_CMP_U_F32;
1385return AMDGPU::S_CMP_NLG_F32;
1387return AMDGPU::S_CMP_NLE_F32;
1389return AMDGPU::S_CMP_NLT_F32;
1391return AMDGPU::S_CMP_NGE_F32;
1393return AMDGPU::S_CMP_NGT_F32;
1395return AMDGPU::S_CMP_NEQ_F32;
1407return AMDGPU::S_CMP_EQ_F16;
1409return AMDGPU::S_CMP_GT_F16;
1411return AMDGPU::S_CMP_GE_F16;
1413return AMDGPU::S_CMP_LT_F16;
1415return AMDGPU::S_CMP_LE_F16;
1417return AMDGPU::S_CMP_LG_F16;
1419return AMDGPU::S_CMP_O_F16;
1421return AMDGPU::S_CMP_U_F16;
1423return AMDGPU::S_CMP_NLG_F16;
1425return AMDGPU::S_CMP_NLE_F16;
1427return AMDGPU::S_CMP_NLT_F16;
1429return AMDGPU::S_CMP_NGE_F16;
1431return AMDGPU::S_CMP_NGT_F16;
1433return AMDGPU::S_CMP_NEQ_F16;
1442bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(
MachineInstr &
I)
const{
1447Register SrcReg =
I.getOperand(2).getReg();
1452Register CCReg =
I.getOperand(0).getReg();
1453if (!isVCC(CCReg, *MRI)) {
1454int Opcode = getS_CMPOpcode(Pred,
Size);
1458 .
add(
I.getOperand(2))
1459 .
add(
I.getOperand(3));
1460BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), CCReg)
1469if (
I.getOpcode() == AMDGPU::G_FCMP)
1477I.getOperand(0).getReg())
1478 .
add(
I.getOperand(2))
1479 .
add(
I.getOperand(3));
1487bool AMDGPUInstructionSelector::selectIntrinsicCmp(
MachineInstr &
I)
const{
1489if (isVCC(Dst, *MRI))
1492LLT DstTy =
MRI->getType(Dst);
1498Register SrcReg =
I.getOperand(2).getReg();
1501// i1 inputs are not supported in GlobalISel. 1507BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1519auto [Src0, Src0Mods] = selectVOP3ModsImpl(
LHS.getReg());
1520auto [Src1, Src1Mods] = selectVOP3ModsImpl(
RHS.getReg());
1522 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &
I,
/*ForceVGPR*/true);
1524 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &
I,
/*ForceVGPR*/true);
1525 SelectedMI =
BuildMI(*BB, &
I,
DL, TII.get(Opcode), Dst);
1527 SelectedMI.
addImm(Src0Mods);
1528 SelectedMI.
addReg(Src0Reg);
1530 SelectedMI.
addImm(Src1Mods);
1531 SelectedMI.
addReg(Src1Reg);
1533 SelectedMI.
addImm(0);
// clamp 1535 SelectedMI.
addImm(0);
// op_sel 1545// Ballot has to zero bits in input lane-mask that are zero in current exec, 1546// Done as AND with exec. For inputs that are results of instruction that 1547// implicitly use same exec, for example compares in same basic block or SCC to 1548// VCC copy, use copy. 1552if (
MI->getParent() !=
MBB)
1555// Lane mask generated by SCC to VCC copy. 1556if (
MI->getOpcode() == AMDGPU::COPY) {
1557auto DstRB =
MRI.getRegBankOrNull(
MI->getOperand(0).getReg());
1558auto SrcRB =
MRI.getRegBankOrNull(
MI->getOperand(1).getReg());
1559if (DstRB && SrcRB && DstRB->
getID() == AMDGPU::VCCRegBankID &&
1560 SrcRB->getID() == AMDGPU::SGPRRegBankID)
1564// Lane mask generated using compare with same exec. 1565if (isa<GAnyCmp>(
MI))
1577bool AMDGPUInstructionSelector::selectBallot(
MachineInstr &
I)
const{
1580Register DstReg =
I.getOperand(0).getReg();
1581Register SrcReg =
I.getOperand(2).getReg();
1582constunsigned BallotSize =
MRI->getType(DstReg).getSizeInBits();
1585// In the common case, the return type matches the wave size. 1586// However we also support emitting i64 ballots in wave32 mode. 1587if (BallotSize != WaveSize && (BallotSize != 64 || WaveSize != 32))
1590 std::optional<ValueAndVReg> Arg =
1594// i64 ballot on Wave32: new Dst(i32) for WaveSize ballot. 1595if (BallotSize != WaveSize) {
1600const int64_t
Value = Arg->
Value.getZExtValue();
1603unsigned Opcode = WaveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1619// Dst = S_AND SrcReg, EXEC 1620unsigned AndOpc = WaveSize == 64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
1630// i64 ballot on Wave32: zero-extend i32 ballot to i64. 1631if (BallotSize != WaveSize) {
1632Register HiReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1634BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1645bool AMDGPUInstructionSelector::selectRelocConstant(
MachineInstr &
I)
const{
1646Register DstReg =
I.getOperand(0).getReg();
1652constbool IsVALU = DstBank->
getID() == AMDGPU::VGPRRegBankID;
1657auto *RelocSymbol = cast<GlobalVariable>(
1662 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1669bool AMDGPUInstructionSelector::selectGroupStaticSize(
MachineInstr &
I)
const{
1672Register DstReg =
I.getOperand(0).getReg();
1674unsigned Mov = DstRB->
getID() == AMDGPU::SGPRRegBankID ?
1675 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1696bool AMDGPUInstructionSelector::selectReturnAddress(
MachineInstr &
I)
const{
1703unsignedDepth =
I.getOperand(2).getImm();
1711// Check for kernel and shader functions 1721// There is a call to @llvm.returnaddress in this function 1724// Get the return address reg and mark it as an implicit live-in 1727 AMDGPU::SReg_64RegClass,
DL);
1734bool AMDGPUInstructionSelector::selectEndCfIntrinsic(
MachineInstr &
MI)
const{
1735// FIXME: Manually selecting to avoid dealing with the SReg_1 trick 1736// SelectionDAG uses for wave32 vs wave64. 1738BuildMI(*BB, &
MI,
MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1739 .
add(
MI.getOperand(1));
1742MI.eraseFromParent();
1744if (!
MRI->getRegClassOrNull(Reg))
1749bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1755unsigned IndexOperand =
MI.getOperand(7).getImm();
1756bool WaveRelease =
MI.getOperand(8).getImm() != 0;
1757bool WaveDone =
MI.getOperand(9).getImm() != 0;
1759if (WaveDone && !WaveRelease)
1762unsigned OrderedCountIndex = IndexOperand & 0x3f;
1763 IndexOperand &= ~0x3f;
1764unsigned CountDw = 0;
1767 CountDw = (IndexOperand >> 24) & 0xf;
1768 IndexOperand &= ~(0xf << 24);
1770if (CountDw < 1 || CountDw > 4) {
1772"ds_ordered_count: dword count must be between 1 and 4");
  unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;

  unsigned Offset0 = OrderedCountIndex << 2;
  unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

    Offset1 |= (CountDw - 1) << 6;

    Offset1 |= ShaderType << 2;

  unsigned Offset = Offset0 | (Offset1 << 8);
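  // Worked example (illustrative only): for ds_ordered_add (Instruction = 0)
  // with OrderedCountIndex = 0, WaveRelease = 1 and WaveDone = 0, Offset0 is 0
  // and Offset1 is 1 before the CountDw and ShaderType fields are OR'd in, so
  // the packed offset is 0x100.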
1809MI.eraseFromParent();
1815case Intrinsic::amdgcn_ds_gws_init:
1816return AMDGPU::DS_GWS_INIT;
1817case Intrinsic::amdgcn_ds_gws_barrier:
1818return AMDGPU::DS_GWS_BARRIER;
1819case Intrinsic::amdgcn_ds_gws_sema_v:
1820return AMDGPU::DS_GWS_SEMA_V;
1821case Intrinsic::amdgcn_ds_gws_sema_br:
1822return AMDGPU::DS_GWS_SEMA_BR;
1823case Intrinsic::amdgcn_ds_gws_sema_p:
1824return AMDGPU::DS_GWS_SEMA_P;
1825case Intrinsic::amdgcn_ds_gws_sema_release_all:
1826return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1832bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(
MachineInstr &
MI,
1834if (!STI.
hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1838// intrinsic ID, vsrc, offset 1839constbool HasVSrc =
MI.getNumOperands() == 3;
1840assert(HasVSrc ||
MI.getNumOperands() == 2);
1842Register BaseOffset =
MI.getOperand(HasVSrc ? 2 : 1).getReg();
1844if (OffsetRB->
getID() != AMDGPU::SGPRRegBankID)
1855// If we legalized the VGPR input, strip out the readfirstlane to analyze the 1856// incoming offset, in case there's an add of a constant. We'll have to put it 1858if (OffsetDef->
getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1859 Readfirstlane = OffsetDef;
1864if (OffsetDef->
getOpcode() == AMDGPU::G_CONSTANT) {
1865// If we have a constant offset, try to use the 0 in m0 as the base. 1866// TODO: Look into changing the default m0 initialization value. If the 1867// default -1 only set the low 16-bits, we could leave it as-is and add 1 to 1868// the immediate offset. 1874 std::tie(BaseOffset, ImmOffset) =
1878// We have the constant offset now, so put the readfirstlane back on the 1879// variable component. 1887 AMDGPU::SReg_32RegClass, *MRI))
1891Register M0Base =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1901// The resource id offset is computed as (<isa opaque base> + M0[21:16] + 1902// offset field) % 64. Some versions of the programming guide omit the m0 1903// part, or claim it's from offset 0. 1919MI.eraseFromParent();
1923bool AMDGPUInstructionSelector::selectDSAppendConsume(
MachineInstr &
MI,
1924bool IsAppend)
const{
1926LLT PtrTy =
MRI->getType(PtrBase);
1930 std::tie(PtrBase,
Offset) = selectDS1Addr1OffsetImpl(
MI.getOperand(2));
1932// TODO: Should this try to look through readfirstlane like GWS? 1933if (!isDSOffsetLegal(PtrBase,
Offset)) {
1934 PtrBase =
MI.getOperand(2).getReg();
1940constunsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1951MI.eraseFromParent();
1955bool AMDGPUInstructionSelector::selectInitWholeWave(
MachineInstr &
MI)
const{
1963bool AMDGPUInstructionSelector::selectSBarrier(
MachineInstr &
MI)
const{
1968// If the workgroup fits in a wave, remove s_barrier_signal and lower 1969// s_barrier/s_barrier_wait to wave_barrier. 1970if (IntrinsicID == Intrinsic::amdgcn_s_barrier ||
1971 IntrinsicID == Intrinsic::amdgcn_s_barrier_wait) {
1976MI.eraseFromParent();
1982// On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait 1989MI.eraseFromParent();
2001 TFE = (TexFailCtrl & 0x1) ?
true :
false;
2003 LWE = (TexFailCtrl & 0x2) ?
true :
false;
2006return TexFailCtrl == 0;
2009bool AMDGPUInstructionSelector::selectImageIntrinsic(
2018unsigned IntrOpcode =
Intr->BaseOpcode;
2023constunsigned ArgOffset =
MI.getNumExplicitDefs() + 1;
2027int NumVDataDwords = -1;
2028bool IsD16 =
MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
2029MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
2035 Unorm =
MI.getOperand(ArgOffset +
Intr->UnormIndex).getImm() != 0;
2039bool IsTexFail =
false;
2041 TFE, LWE, IsTexFail))
2044constintFlags =
MI.getOperand(ArgOffset +
Intr->NumArgs).getImm();
2045constbool IsA16 = (
Flags & 1) != 0;
2046constbool IsG16 = (
Flags & 2) != 0;
2048// A16 implies 16 bit gradients if subtarget doesn't support G16 2049if (IsA16 && !STI.
hasG16() && !IsG16)
2053unsigned DMaskLanes = 0;
2056 VDataOut =
MI.getOperand(0).getReg();
2057 VDataIn =
MI.getOperand(2).getReg();
2058LLT Ty =
MRI->getType(VDataIn);
2060// Be careful to allow atomic swap on 16-bit element vectors. 2061constbool Is64Bit = BaseOpcode->
AtomicX2 ?
2066assert(
MI.getOperand(3).getReg() == AMDGPU::NoRegister);
2068 DMask = Is64Bit ? 0xf : 0x3;
2069 NumVDataDwords = Is64Bit ? 4 : 2;
2071 DMask = Is64Bit ? 0x3 : 0x1;
2072 NumVDataDwords = Is64Bit ? 2 : 1;
2075 DMask =
MI.getOperand(ArgOffset +
Intr->DMaskIndex).getImm();
2078if (BaseOpcode->
Store) {
2079 VDataIn =
MI.getOperand(1).getReg();
2080 VDataTy =
MRI->getType(VDataIn);
2085 VDataOut =
MI.getOperand(0).getReg();
2086 VDataTy =
MRI->getType(VDataOut);
      NumVDataDwords = DMaskLanes;

      NumVDataDwords = (DMaskLanes + 1) / 2;
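      // With packed D16, two 16-bit components share each dword, e.g. a dmask
      // selecting three components needs (3 + 1) / 2 = 2 dwords.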
2095if (Subtarget->
hasG16() && IsG16) {
2099 IntrOpcode = G16MappingInfo->
G16;
// set opcode to variant with _g16 2102// TODO: Check this in verifier. 2103assert((!IsTexFail || DMaskLanes >= 1) &&
"should have legalized this");
2105unsignedCPol =
MI.getOperand(ArgOffset +
Intr->CachePolicyIndex).getImm();
2112int NumVAddrRegs = 0;
2113int NumVAddrDwords = 0;
2114for (
unsignedI =
Intr->VAddrStart; I < Intr->VAddrEnd;
I++) {
2115// Skip the $noregs and 0s inserted during legalization. 2118continue;
// XXX - Break? 2125 NumVAddrDwords += (
MRI->getType(
Addr).getSizeInBits() + 31) / 32;
2128// The legalizer preprocessed the intrinsic arguments. If we aren't using 2129// NSA, these should have been packed into a single value in the first 2132 NumVAddrRegs != 1 &&
2134 : NumVAddrDwords == NumVAddrRegs);
2135if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
2146 NumVDataDwords, NumVAddrDwords);
2147 }
elseif (IsGFX11Plus) {
2149 UseNSA ? AMDGPU::MIMGEncGfx11NSA
2150 : AMDGPU::MIMGEncGfx11Default,
2151 NumVDataDwords, NumVAddrDwords);
2152 }
elseif (IsGFX10Plus) {
2154 UseNSA ? AMDGPU::MIMGEncGfx10NSA
2155 : AMDGPU::MIMGEncGfx10Default,
2156 NumVDataDwords, NumVAddrDwords);
2160 NumVDataDwords, NumVAddrDwords);
2164 <<
"requested image instruction is not supported on this GPU\n");
2171 NumVDataDwords, NumVAddrDwords);
2174 NumVDataDwords, NumVAddrDwords);
2184constbool Is64 =
MRI->getType(VDataOut).getSizeInBits() == 64;
2187 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
2188unsignedSubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
2191if (!
MRI->use_empty(VDataOut)) {
2197 MIB.
addDef(VDataOut);
// vdata output 2202 MIB.
addReg(VDataIn);
// vdata input 2204for (
intI = 0;
I != NumVAddrRegs; ++
I) {
2212 MIB.
addReg(
MI.getOperand(ArgOffset +
Intr->RsrcIndex).getReg());
2214 MIB.
addReg(
MI.getOperand(ArgOffset +
Intr->SampIndex).getReg());
2216 MIB.
addImm(DMask);
// dmask 2224 MIB.
addImm(IsA16 &&
// a16 or r128 2225 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2227 MIB.
addImm(IsA16 ? -1 : 0);
2241 MIB.
addImm(IsD16 ? -1 : 0);
2243MI.eraseFromParent();
2249// We need to handle this here because tablegen doesn't support matching 2250// instructions with multiple outputs. 2251bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2262unsignedOffset =
MI.getOperand(6).getImm();
2264auto MIB =
BuildMI(*
MBB, &
MI,
DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2272MI.eraseFromParent();
2276bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2279switch (IntrinsicID) {
2280case Intrinsic::amdgcn_end_cf:
2281return selectEndCfIntrinsic(
I);
2282case Intrinsic::amdgcn_ds_ordered_add:
2283case Intrinsic::amdgcn_ds_ordered_swap:
2284return selectDSOrderedIntrinsic(
I, IntrinsicID);
2285case Intrinsic::amdgcn_ds_gws_init:
2286case Intrinsic::amdgcn_ds_gws_barrier:
2287case Intrinsic::amdgcn_ds_gws_sema_v:
2288case Intrinsic::amdgcn_ds_gws_sema_br:
2289case Intrinsic::amdgcn_ds_gws_sema_p:
2290case Intrinsic::amdgcn_ds_gws_sema_release_all:
2291return selectDSGWSIntrinsic(
I, IntrinsicID);
2292case Intrinsic::amdgcn_ds_append:
2293return selectDSAppendConsume(
I,
true);
2294case Intrinsic::amdgcn_ds_consume:
2295return selectDSAppendConsume(
I,
false);
2296case Intrinsic::amdgcn_init_whole_wave:
2297return selectInitWholeWave(
I);
2298case Intrinsic::amdgcn_s_barrier:
2299case Intrinsic::amdgcn_s_barrier_signal:
2300case Intrinsic::amdgcn_s_barrier_wait:
2301return selectSBarrier(
I);
2302case Intrinsic::amdgcn_raw_buffer_load_lds:
2303case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2304case Intrinsic::amdgcn_struct_buffer_load_lds:
2305case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2306return selectBufferLoadLds(
I);
2307case Intrinsic::amdgcn_global_load_lds:
2308return selectGlobalLoadLds(
I);
2309case Intrinsic::amdgcn_exp_compr:
2313F,
"intrinsic not supported on subtarget",
I.getDebugLoc(),
DS_Error);
2314F.getContext().diagnose(NoFpRet);
2318case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2319return selectDSBvhStackIntrinsic(
I);
2320case Intrinsic::amdgcn_s_barrier_init:
2321case Intrinsic::amdgcn_s_barrier_signal_var:
2322return selectNamedBarrierInit(
I, IntrinsicID);
2323case Intrinsic::amdgcn_s_barrier_join:
2324case Intrinsic::amdgcn_s_get_named_barrier_state:
2325return selectNamedBarrierInst(
I, IntrinsicID);
2326case Intrinsic::amdgcn_s_get_barrier_state:
2327return selectSGetBarrierState(
I, IntrinsicID);
2328case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2329return selectSBarrierSignalIsfirst(
I, IntrinsicID);
2334bool AMDGPUInstructionSelector::selectG_SELECT(
MachineInstr &
I)
const{
2341Register DstReg =
I.getOperand(0).getReg();
2346if (!isVCC(CCReg, *MRI)) {
2347unsigned SelectOpcode =
Size == 64 ? AMDGPU::S_CSELECT_B64 :
2348 AMDGPU::S_CSELECT_B32;
2352// The generic constrainSelectedInstRegOperands doesn't work for the scc register 2353// bank, because it does not cover the register class that we used to represent 2354// for it. So we need to manually set the register class here. 2355if (!
MRI->getRegClassOrNull(CCReg))
2358 .
add(
I.getOperand(2))
2359 .
add(
I.getOperand(3));
2368// Wide VGPR select should have been split in RegBankSelect. 2373BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2375 .
add(
I.getOperand(3))
2377 .
add(
I.getOperand(2))
2378 .
add(
I.getOperand(1));
2385bool AMDGPUInstructionSelector::selectG_TRUNC(
MachineInstr &
I)
const{
2386Register DstReg =
I.getOperand(0).getReg();
2387Register SrcReg =
I.getOperand(1).getReg();
2388constLLT DstTy =
MRI->getType(DstReg);
2389constLLT SrcTy =
MRI->getType(SrcReg);
2395// This is a special case. We don't treat s1 for legalization artifacts as 2404constbool IsVALU = DstRB->
getID() == AMDGPU::VGPRRegBankID;
2413if (!SrcRC || !DstRC)
2422if (DstRC == &AMDGPU::VGPR_16RegClass && SrcSize == 32) {
2427 .
addReg(SrcReg, 0, AMDGPU::lo16);
2439 .
addReg(SrcReg, 0, AMDGPU::sub0);
2441 .
addReg(SrcReg, 0, AMDGPU::sub1);
2444// Write the low 16-bits of the high element into the high 16-bits of the 2448 .
addImm(0)
// $src0_modifiers 2457Register TmpReg0 =
MRI->createVirtualRegister(DstRC);
2458Register TmpReg1 =
MRI->createVirtualRegister(DstRC);
2459Register ImmReg =
MRI->createVirtualRegister(DstRC);
2461BuildMI(*
MBB,
I,
DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2471unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2472unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2473unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2485And.setOperandDead(3);
// Dead scc 2486Or.setOperandDead(3);
// Dead scc 2500if (SubRegIdx == AMDGPU::NoSubRegister)
2503// Deal with weird cases where the class only partially supports the subreg 2506 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2510if (SrcWithSubRC != SrcRC) {
2515I.getOperand(1).setSubReg(SubRegIdx);
2518I.setDesc(TII.get(TargetOpcode::COPY));
/// \returns true if a bitmask for \p Size bits will be an inline immediate.
static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
  Mask = maskTrailingOnes<unsigned>(Size);
  int SignedMask = static_cast<int>(Mask);
  return SignedMask >= -16 && SignedMask <= 64;
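  // Example for the check above (illustrative): Size == 4 gives Mask 0xf (15)
  // and Size == 32 gives 0xffffffff (-1 as a signed int), both inline
  // immediates; Size == 16 gives 0xffff (65535), which is not.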
2529// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1. 2530constRegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2534if (
auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
2537// Ignore the type, since we don't use vcc in artifacts. 2538if (
auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
2543bool AMDGPUInstructionSelector::selectG_SZA_EXT(
MachineInstr &
I)
const{
2544bool InReg =
I.getOpcode() == AMDGPU::G_SEXT_INREG;
2545boolSigned =
I.getOpcode() == AMDGPU::G_SEXT || InReg;
2548constRegister DstReg =
I.getOperand(0).getReg();
2549constRegister SrcReg =
I.getOperand(1).getReg();
2551constLLT DstTy =
MRI->getType(DstReg);
2552constLLT SrcTy =
MRI->getType(SrcReg);
2553constunsigned SrcSize =
I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2559// Artifact casts should never use vcc. 2560constRegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2562// FIXME: This should probably be illegal and split earlier. 2563if (
I.getOpcode() == AMDGPU::G_ANYEXT) {
2565return selectCOPY(
I);
2568TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2571TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2573Register UndefReg =
MRI->createVirtualRegister(SrcRC);
2586if (SrcBank->
getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2587// 64-bit should have been split up in RegBankSelect 2589// Try to use an and with a mask if it will save code size. 2600constunsignedBFE =
Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2605 .
addImm(SrcSize);
// Width 2610if (SrcBank->
getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2612 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2616if (
Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2617constunsigned SextOpc = SrcSize == 8 ?
2618 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2625// Using a single 32-bit SALU to calculate the high half is smaller than 2626// S_BFE with a literal constant operand. 2627if (DstSize > 32 && SrcSize == 32) {
2628Register HiReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2629unsignedSubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2649constunsigned BFE64 =
Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2650constunsigned BFE32 =
Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
  // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16] = width.
  if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
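    // Illustrative example of the encoding above: offset 0 and width 32 are
    // packed as (32 << 16) | 0 == 0x200000.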
2654// We need a 64-bit register source, but the high bits don't matter. 2655Register ExtReg =
MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2656Register UndefReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2657unsignedSubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2721if (Shuffle->
getOpcode() != AMDGPU::G_SHUFFLE_VECTOR)
2730if (Mask[0] == 1 && Mask[1] <= 1) {
2738bool AMDGPUInstructionSelector::selectG_FPEXT(
MachineInstr &
I)
const{
2744if (DstRB->
getID() != AMDGPU::SGPRRegBankID)
2753BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2763bool AMDGPUInstructionSelector::selectG_FNEG(
MachineInstr &
MI)
const{
2764// Only manually handle the f64 SGPR case. 2766// FIXME: This is a workaround for 2.5 different tablegen problems. Because 2767// the bit ops theoretically have a second result due to the implicit def of 2768// SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing 2769// that is easy by disabling the check. The result works, but uses a 2770// nonsensical sreg32orlds_and_sreg_1 regclass. 2772// The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to 2773// the variadic REG_SEQUENCE operands. 2777if (DstRB->
getID() != AMDGPU::SGPRRegBankID ||
2792Register LoReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2793Register HiReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2794Register ConstReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2795Register OpReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2798 .
addReg(Src, 0, AMDGPU::sub0);
2800 .
addReg(Src, 0, AMDGPU::sub1);
2801BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2804// Set or toggle sign bit. 2805unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2810BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2815MI.eraseFromParent();
2819// FIXME: This is a workaround for the same tablegen problems as G_FNEG 2820bool AMDGPUInstructionSelector::selectG_FABS(
MachineInstr &
MI)
const{
2823if (DstRB->
getID() != AMDGPU::SGPRRegBankID ||
2830Register LoReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2831Register HiReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2832Register ConstReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2833Register OpReg =
MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2840 .
addReg(Src, 0, AMDGPU::sub0);
2842 .
addReg(Src, 0, AMDGPU::sub1);
2843BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2847// TODO: Should this used S_BITSET0_*? 2848BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2852BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2858MI.eraseFromParent();
2863returnMI.getOpcode() == TargetOpcode::G_CONSTANT;
2866void AMDGPUInstructionSelector::getAddrModeInfo(
constMachineInstr &Load,
2869unsigned OpNo =
Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2871MRI.getUniqueVRegDef(
Load.getOperand(OpNo).getReg());
2875if (PtrMI->
getOpcode() != TargetOpcode::G_PTR_ADD)
2880for (
unsigned i = 1; i != 3; ++i) {
2885// TODO: Could handle constant base + variable offset, but a combine 2886// probably should have commuted it. 2892if (OpBank->
getID() == AMDGPU::SGPRRegBankID)
2893 GEPInfo.SgprParts.push_back(GEPOp.
getReg());
2895 GEPInfo.VgprParts.push_back(GEPOp.
getReg());
2899 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2902bool AMDGPUInstructionSelector::isSGPR(
Register Reg)
const{
2903return RBI.
getRegBank(Reg, *MRI, TRI)->
getID() == AMDGPU::SGPRRegBankID;
2906bool AMDGPUInstructionSelector::isInstrUniform(
constMachineInstr &
MI)
const{
2907if (!
MI.hasOneMemOperand())
2913// UndefValue means this is a load of a kernel input. These are uniform. 2914// Sometimes LDS instructions have constant pointers. 2915// If Ptr is null, then that means this mem operand contains a 2916// PseudoSourceValue like GOT. 2917if (!
Ptr || isa<UndefValue>(
Ptr) || isa<Argument>(
Ptr) ||
2918 isa<Constant>(
Ptr) || isa<GlobalValue>(
Ptr))
2924if (
MI.getOpcode() == AMDGPU::G_PREFETCH)
2926 AMDGPU::SGPRRegBankID;
2929returnI &&
I->getMetadata(
"amdgpu.uniform");
2933for (
const GEPInfo &GEPInfo : AddrInfo) {
2934if (!GEPInfo.VgprParts.empty())
2940void AMDGPUInstructionSelector::initM0(
MachineInstr &
I)
const{
2941constLLT PtrTy =
MRI->getType(
I.getOperand(1).getReg());
2947// If DS instructions require M0 initialization, insert it before selecting. 2948BuildMI(*BB, &
I,
I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2953bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2960if (Reg.isPhysical())
2964constunsigned Opcode =
MI.getOpcode();
2966if (Opcode == AMDGPU::COPY)
2969if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2970 Opcode == AMDGPU::G_XOR)
2974if (
auto *GI = dyn_cast<GIntrinsic>(&
MI))
2975return GI->is(Intrinsic::amdgcn_class);
2977return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2980bool AMDGPUInstructionSelector::selectG_BRCOND(
MachineInstr &
I)
const{
2990// In SelectionDAG, we inspect the IR block for uniformity metadata to decide 2991// whether the branch is uniform when selecting the instruction. In 2992// GlobalISel, we should push that decision into RegBankSelect. Assume for now 2993// RegBankSelect knows what it's doing if the branch condition is scc, even 2994// though it currently does not. 2995if (!isVCC(CondReg, *MRI)) {
2999 CondPhysReg = AMDGPU::SCC;
3000 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
3001 ConstrainRC = &AMDGPU::SReg_32RegClass;
3003// FIXME: Should scc->vcc copies and with exec? 3005// Unless the value of CondReg is a result of a V_CMP* instruction then we 3006// need to insert an and with exec. 3009constunsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
3010constRegisterExec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
3020 CondPhysReg =
TRI.getVCC();
3021 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
3022 ConstrainRC =
TRI.getBoolRC();
3025if (!
MRI->getRegClassOrNull(CondReg))
3026MRI->setRegClass(CondReg, ConstrainRC);
3028BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), CondPhysReg)
3031 .
addMBB(
I.getOperand(1).getMBB());
3037bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
3039Register DstReg =
I.getOperand(0).getReg();
3041constbool IsVGPR = DstRB->
getID() == AMDGPU::VGPRRegBankID;
3042I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
3047 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
3050bool AMDGPUInstructionSelector::selectG_PTRMASK(
MachineInstr &
I)
const{
3051Register DstReg =
I.getOperand(0).getReg();
3052Register SrcReg =
I.getOperand(1).getReg();
3053Register MaskReg =
I.getOperand(2).getReg();
3054LLT Ty =
MRI->getType(DstReg);
3055LLT MaskTy =
MRI->getType(MaskReg);
3062constbool IsVGPR = DstRB->
getID() == AMDGPU::VGPRRegBankID;
3063if (DstRB != SrcRB)
// Should only happen for hand written MIR. 3066// Try to avoid emitting a bit operation when we only need to touch half of 3067// the 64-bit pointer. 3072constbool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
3073constbool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
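  // For instance, an alignment mask such as 0xfffffffffffff000 has all ones in
  // the high 32 bits, so the high half of the pointer can be a plain copy and
  // only the low half needs the AND (illustrative example).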
3076 !CanCopyLow32 && !CanCopyHi32) {
3077auto MIB =
BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::S_AND_B64), DstReg)
3085unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
3087 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3092TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
3101"ptrmask should have been narrowed during legalize");
3103auto NewOp =
BuildMI(*BB, &
I,
DL, TII.get(NewOpc), DstReg)
3113Register HiReg =
MRI->createVirtualRegister(&RegRC);
3114Register LoReg =
MRI->createVirtualRegister(&RegRC);
3116// Extract the subregisters from the source pointer. 3117BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), LoReg)
3118 .
addReg(SrcReg, 0, AMDGPU::sub0);
3119BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), HiReg)
3120 .
addReg(SrcReg, 0, AMDGPU::sub1);
3125// If all the bits in the low half are 1, we only need a copy for it. 3128// Extract the mask subregister and apply the and. 3129Register MaskLo =
MRI->createVirtualRegister(&RegRC);
3130 MaskedLo =
MRI->createVirtualRegister(&RegRC);
3132BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), MaskLo)
3133 .
addReg(MaskReg, 0, AMDGPU::sub0);
3134BuildMI(*BB, &
I,
DL, TII.get(NewOpc), MaskedLo)
3140// If all the bits in the high half are 1, we only need a copy for it. 3143Register MaskHi =
MRI->createVirtualRegister(&RegRC);
3144 MaskedHi =
MRI->createVirtualRegister(&RegRC);
3146BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::COPY), MaskHi)
3147 .
addReg(MaskReg, 0, AMDGPU::sub1);
3148BuildMI(*BB, &
I,
DL, TII.get(NewOpc), MaskedHi)
3153BuildMI(*BB, &
I,
DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3162/// Return the register to use for the index value, and the subregister to use 3163/// for the indirectly accessed register. 3164static std::pair<Register, unsigned>
3171 std::tie(IdxBaseReg,
Offset) =
3173if (IdxBaseReg == AMDGPU::NoRegister) {
3174// This will happen if the index is a known constant. This should ordinarily 3175// be legalized out, but handle it as a register just in case. 3177 IdxBaseReg = IdxReg;
3182// Skip out of bounds offsets, or else we would end up using an undefined 3184if (
static_cast<unsigned>(
Offset) >= SubRegs.
size())
3185return std::pair(IdxReg, SubRegs[0]);
3186return std::pair(IdxBaseReg, SubRegs[
Offset]);
3189bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3195LLT DstTy =
MRI->getType(DstReg);
3196LLT SrcTy =
MRI->getType(SrcReg);
3202// The index must be scalar. If it wasn't RegBankSelect should have moved this 3203// into a waterfall loop. 3204if (IdxRB->
getID() != AMDGPU::SGPRRegBankID)
3208TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3210TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3211if (!SrcRC || !DstRC)
3226if (SrcRB->
getID() == AMDGPU::SGPRRegBankID) {
3230BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3233unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3237MI.eraseFromParent();
3245BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3247BuildMI(*BB, &
MI,
DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3250MI.eraseFromParent();
3261MI.eraseFromParent();
// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(

  LLT VecTy = MRI->getType(DstReg);
  LLT ValTy = MRI->getType(ValReg);

  // The index must be scalar. If it wasn't RegBankSelect should have moved this
  // into a waterfall loop.
  if (IdxRB->getID() != AMDGPU::SGPRRegBankID)

      TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
      TRI.getRegClassForTypeOnBank(ValTy, *ValRB);

  if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)

  std::tie(IdxReg, SubReg) =

  BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)

      VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
    MI.eraseFromParent();

  MI.eraseFromParent();
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
  unsigned Size = MI.getOperand(3).getImm();

  // The struct intrinsic variants add one additional operand over raw.
  const bool HasVIndex = MI.getNumOperands() == 9;
    VIndex = MI.getOperand(4).getReg();

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  std::optional<ValueAndVReg> MaybeVOffset =
  const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();

    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
    Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                    : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;

      .add(MI.getOperand(2));

  if (HasVIndex && HasVOffset) {
    Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
    BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
  } else if (HasVIndex) {
  } else if (HasVOffset) {

  MIB.add(MI.getOperand(1));            // rsrc
  MIB.add(MI.getOperand(5 + OpOffset)); // soffset
  MIB.add(MI.getOperand(6 + OpOffset)); // imm offset

  unsigned Aux = MI.getOperand(7 + OpOffset).getImm();

  LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
  StorePtrI.V = nullptr;

  MI.eraseFromParent();
/// Match a zero extend from a 32-bit value to 64-bits.
  // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
  if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)

  assert(Def->getNumOperands() == 3 &&
  return Def->getOperand(1).getReg();
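// Illustrative note (not part of the original source): after legalization a
// zero extend of a 32-bit value to 64 bits typically appears as
//   %zext:_(s64) = G_MERGE_VALUES %x:_(s32), %zero:_(s32)
// where %zero is a 32-bit constant 0, so returning operand 1 of the
// G_MERGE_VALUES recovers the original 32-bit value %x.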
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const {
  unsigned Size = MI.getOperand(3).getImm();

    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;

      .add(MI.getOperand(2));

  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
  if (isSGPR(AddrDef->Reg)) {
  } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();

    VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  MIB.add(MI.getOperand(4))  // offset
     .add(MI.getOperand(5)); // cpol

  LoadPtrI.Offset = MI.getOperand(4).getImm();
      sizeof(int32_t), Align(4));

  MI.eraseFromParent();
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const {
  MI.setDesc(TII.get(MI.getOperand(1).getImm()));
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
// FIXME: This should be removed and let the patterns select. We just need the
// AGPR/VGPR combination versions.
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
  case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
  case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_f16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_F16_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_f16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_F16_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF16_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf16:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF16_e64;
  case Intrinsic::amdgcn_smfmac_i32_16x16x128_i8:
    Opc = AMDGPU::V_SMFMAC_I32_16X16X128_I8_e64;
  case Intrinsic::amdgcn_smfmac_i32_32x32x64_i8:
    Opc = AMDGPU::V_SMFMAC_I32_32X32X64_I8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_BF8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_16x16x128_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_16X16X128_FP8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_bf8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_BF8_FP8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_bf8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_BF8_e64;
  case Intrinsic::amdgcn_smfmac_f32_32x32x64_fp8_fp8:
    Opc = AMDGPU::V_SMFMAC_F32_32X32X64_FP8_FP8_e64;

  auto VDst_In = MI.getOperand(4);

  MI.setDesc(TII.get(Opc));
  MI.removeOperand(4); // VDst_In
  MI.removeOperand(1); // Intrinsic ID
  MI.addOperand(VDst_In); // Readd VDst_In to the end
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
bool AMDGPUInstructionSelector::selectPermlaneSwapIntrin(
  if (IntrID == Intrinsic::amdgcn_permlane16_swap &&
  if (IntrID == Intrinsic::amdgcn_permlane32_swap &&

  unsigned Opcode = IntrID == Intrinsic::amdgcn_permlane16_swap
                        ? AMDGPU::V_PERMLANE16_SWAP_B32_e64
                        : AMDGPU::V_PERMLANE32_SWAP_B32_e64;

  MI.setDesc(TII.get(Opcode));
bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
      IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
  MI.eraseFromParent();
// Match BITOP3 operation and return a number of matched instructions plus
  unsigned NumOpcodes = 0;

  // Define truth table given Src0, Src1, Src2 bits permutations:
  const uint8_t SrcBits[3] = { 0xf0, 0xcc, 0xaa };

    for (unsigned I = 0; I < Src.size(); ++I) {
      // Try to find existing reused operand
      // Try to replace parent operator

    if (Src.size() == 3) {
      // No room left for operands. Try one last time, there can be a 'not' of
      // one of our source operands. In this case we can compute the bits
      // without growing Src vector.
      for (unsigned I = 0; I < Src.size(); ++I) {

    Bits = SrcBits[Src.size()];

  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR: {
    if (!getOperandBits(LHS, LHSBits) ||
        !getOperandBits(RHS, RHSBits)) {
      return std::make_pair(0, 0);

    // Recursion is naturally limited by the size of the operand vector.
    NumOpcodes += Op.first;
    NumOpcodes += Op.first;

    return std::make_pair(0, 0);

  switch (MI->getOpcode()) {
  case TargetOpcode::G_AND:
    TTbl = LHSBits & RHSBits;
  case TargetOpcode::G_OR:
    TTbl = LHSBits | RHSBits;
  case TargetOpcode::G_XOR:
    TTbl = LHSBits ^ RHSBits;

  return std::make_pair(NumOpcodes + 1, TTbl);
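// Illustrative note (not part of the original source): with the SrcBits
// constants above, each source contributes a fixed 8-bit pattern
// (Src0 = 0xf0, Src1 = 0xcc, Src2 = 0xaa), and applying the matched logical
// operators to those bytes yields the BITOP3 truth table. For example
// (Src0 & Src1) | Src2 gives (0xf0 & 0xcc) | 0xaa = 0xc0 | 0xaa = 0xea, which
// is the immediate the BITOP3 instruction would receive.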
bool AMDGPUInstructionSelector::selectBITOP3(MachineInstr &MI) const {
  const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;

  std::tie(NumOpcodes, TTbl) = BitOp3_Op(DstReg, Src, *MRI);

  // Src.empty() case can happen if all operands are all zero or all ones.
  // Normally it shall be optimized out before reaching this.
  if (NumOpcodes < 2 || Src.empty())

  if (NumOpcodes == 2 && IsB32) {
    // Avoid using BITOP3 for OR3, XOR3, AND_OR. This is not faster but makes
    // asm more readable. This cannot be modeled with AddedComplexity because
    // selector does not know how many operations did we match.
  } else if (NumOpcodes < 4) {
    // For a uniform case threshold should be higher to account for moves
    // between VGPRs and SGPRs. It needs one operand in a VGPR, rest two can be
    // in SGPRs and a readfirstlane after.

  unsigned Opc = IsB32 ? AMDGPU::V_BITOP3_B32_e64 : AMDGPU::V_BITOP3_B16_e64;

  for (unsigned I = 0; I < Src.size(); ++I) {
    if (RB->getID() != AMDGPU::SGPRRegBankID)

    Register NewReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // Last operand can be ignored, turning a ternary operation into a binary.
  // For example: (~a & b & c) | (~a & b & ~c) -> (~a & b). We can replace
  // 'c' with 'a' here without changing the answer. In some pathological
  // cases it should be possible to get an operation with a single operand
  // too if optimizer would not catch it.
  while (Src.size() < 3)
    Src.push_back(Src[0]);

  MIB.addImm(0); // src_mod0
  MIB.addImm(0); // src_mod1
  MIB.addImm(0); // src_mod2

  MI.eraseFromParent();
bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
  WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  MI.eraseFromParent();
  if (!I.isPreISelOpcode()) {
      return selectCOPY(I);

  switch (I.getOpcode()) {
  case TargetOpcode::G_AND:
  case TargetOpcode::G_OR:
  case TargetOpcode::G_XOR:
    return selectG_AND_OR_XOR(I);
  case TargetOpcode::G_ADD:
  case TargetOpcode::G_SUB:
  case TargetOpcode::G_PTR_ADD:
    return selectG_ADD_SUB(I);
  case TargetOpcode::G_UADDO:
  case TargetOpcode::G_USUBO:
  case TargetOpcode::G_UADDE:
  case TargetOpcode::G_USUBE:
    return selectG_UADDO_USUBO_UADDE_USUBE(I);
  case AMDGPU::G_AMDGPU_MAD_U64_U32:
  case AMDGPU::G_AMDGPU_MAD_I64_I32:
    return selectG_AMDGPU_MAD_64_32(I);
  case TargetOpcode::G_INTTOPTR:
  case TargetOpcode::G_BITCAST:
  case TargetOpcode::G_PTRTOINT:
  case TargetOpcode::G_FREEZE:
    return selectCOPY(I);
  case TargetOpcode::G_FNEG:
    return selectG_FNEG(I);
  case TargetOpcode::G_FABS:
    return selectG_FABS(I);
  case TargetOpcode::G_EXTRACT:
    return selectG_EXTRACT(I);
  case TargetOpcode::G_MERGE_VALUES:
  case TargetOpcode::G_CONCAT_VECTORS:
    return selectG_MERGE_VALUES(I);
  case TargetOpcode::G_UNMERGE_VALUES:
    return selectG_UNMERGE_VALUES(I);
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
    return selectG_BUILD_VECTOR(I);
  case TargetOpcode::G_IMPLICIT_DEF:
    return selectG_IMPLICIT_DEF(I);
  case TargetOpcode::G_INSERT:
    return selectG_INSERT(I);
  case TargetOpcode::G_INTRINSIC:
  case TargetOpcode::G_INTRINSIC_CONVERGENT:
    return selectG_INTRINSIC(I);
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
  case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
    return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
  case TargetOpcode::G_ICMP:
  case TargetOpcode::G_FCMP:
    if (selectG_ICMP_or_FCMP(I))
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_ATOMICRMW_XCHG:
  case TargetOpcode::G_ATOMICRMW_ADD:
  case TargetOpcode::G_ATOMICRMW_SUB:
  case TargetOpcode::G_ATOMICRMW_AND:
  case TargetOpcode::G_ATOMICRMW_OR:
  case TargetOpcode::G_ATOMICRMW_XOR:
  case TargetOpcode::G_ATOMICRMW_MIN:
  case TargetOpcode::G_ATOMICRMW_MAX:
  case TargetOpcode::G_ATOMICRMW_UMIN:
  case TargetOpcode::G_ATOMICRMW_UMAX:
  case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
  case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
  case TargetOpcode::G_ATOMICRMW_FADD:
  case TargetOpcode::G_ATOMICRMW_FMIN:
  case TargetOpcode::G_ATOMICRMW_FMAX:
    return selectG_LOAD_STORE_ATOMICRMW(I);
  case TargetOpcode::G_SELECT:
    return selectG_SELECT(I);
  case TargetOpcode::G_TRUNC:
    return selectG_TRUNC(I);
  case TargetOpcode::G_SEXT:
  case TargetOpcode::G_ZEXT:
  case TargetOpcode::G_ANYEXT:
  case TargetOpcode::G_SEXT_INREG:
    // This is a workaround. For extension from type i1, `selectImpl()` uses
    // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
    // i1 can only be held in an SGPR class.
    return selectG_SZA_EXT(I);
  case TargetOpcode::G_FPEXT:
    if (selectG_FPEXT(I))
  case TargetOpcode::G_BRCOND:
    return selectG_BRCOND(I);
  case TargetOpcode::G_GLOBAL_VALUE:
    return selectG_GLOBAL_VALUE(I);
  case TargetOpcode::G_PTRMASK:
    return selectG_PTRMASK(I);
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
    return selectG_EXTRACT_VECTOR_ELT(I);
  case TargetOpcode::G_INSERT_VECTOR_ELT:
    return selectG_INSERT_VECTOR_ELT(I);
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
  case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
    assert(Intr && "not an image intrinsic with image pseudo");
    return selectImageIntrinsic(I, Intr);
  case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
    return selectBVHIntrinsic(I);
    return selectG_SBFX_UBFX(I);
  case AMDGPU::G_SI_CALL:
    I.setDesc(TII.get(AMDGPU::SI_CALL));
  case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
    return selectWaveAddress(I);
  case AMDGPU::G_STACKRESTORE:
    return selectStackRestore(I);
  case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
    return selectCOPY_SCC_VCC(I);
  case AMDGPU::G_AMDGPU_COPY_VCC_SCC:
    return selectCOPY_VCC_SCC(I);
  case AMDGPU::G_AMDGPU_READANYLANE:
    return selectReadAnyLane(I);
  case TargetOpcode::G_CONSTANT:
  case TargetOpcode::G_FCONSTANT:
AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {

std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
    Register Src, bool IsCanonicalizing, bool AllowAbs, bool OpSel) const {
  if (MI->getOpcode() == AMDGPU::G_FNEG) {
    Src = MI->getOperand(1).getReg();
  } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
    // Fold fsub [+-]0 into fneg. This may not have folded depending on the
    // denormal mode, but we're implicitly canonicalizing in a source operand.
    if (LHS && LHS->isZero()) {
      Src = MI->getOperand(2).getReg();

  if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
    Src = MI->getOperand(1).getReg();

  return std::pair(Src, Mods);

Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
    bool ForceVGPR) const {
  if ((Mods != 0 || ForceVGPR) &&
    // If we looked through copies to find source modifiers on an SGPR operand,
    // we now have an SGPR register source. To avoid potentially violating the
    // constant bus restriction, we need to insert a copy to a VGPR.
        TII.get(AMDGPU::COPY), VGPRSrc)
/// This will select either an SGPR or VGPR operand and will save us from
/// having to write an extra tablegen pattern.
AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {

AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));

AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));

AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {

AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));

AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/false);
  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));

AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
  std::tie(Src, Mods) =
      selectVOP3ModsImpl(Root.getReg(), /*IsCanonicalizing=*/true,
  MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));

AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
  if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PModsImpl(
  if (MI->getOpcode() == AMDGPU::G_FNEG &&
      // It's possible to see an f32 fneg here, but unlikely.
      // TODO: Treat f32 fneg as only high bit.
    Src = MI->getOperand(1).getReg();
    MI = MRI.getVRegDef(Src);

  // TODO: Handle G_FSUB 0 as fneg
  // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
  (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()

  // Packed instructions do not have abs modifiers.
  return std::pair(Src, Mods);

AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);

AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);

AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
  // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
  // Value is in Imm operand as i1 sign extended to int64_t.
  // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
         "expected i1 value");

AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
         "expected i1 value");
  switch (Elts.size()) {
    DstRegClass = &AMDGPU::VReg_256RegClass;
    DstRegClass = &AMDGPU::VReg_128RegClass;
    DstRegClass = &AMDGPU::VReg_64RegClass;

  auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
                 .addDef(MRI.createVirtualRegister(DstRegClass));
  for (unsigned i = 0; i < Elts.size(); ++i) {

  if (ModOpcode == TargetOpcode::G_FNEG) {
    // Check if all elements also have abs modifier
    for (auto El : Elts) {
    if (Elts.size() != NegAbsElts.size()) {
    assert(ModOpcode == TargetOpcode::G_FABS);

AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
  assert(BV->getNumSources() > 0);
  // Based on first element decide which mod we match, neg or abs
  unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
  for (unsigned i = 0; i < BV->getNumSources(); ++i) {
    ElF32 = MRI->getVRegDef(BV->getSourceReg(i));

  // All elements had ModOpcode modifier
  if (BV->getNumSources() == EltsF32.size()) {

AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
  for (unsigned i = 0; i < CV->getNumSources(); ++i) {

  // All elements had ModOpcode modifier
  if (CV->getNumSources() == EltsV2F16.size()) {

AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
  assert(CV->getNumSources() > 0);
  // Based on first element decide which mod we match, neg or abs
  unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
  for (unsigned i = 0; i < CV->getNumSources(); ++i) {
    ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));

  // All elements had ModOpcode modifier
  if (CV->getNumSources() == EltsV2F16.size()) {

AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
  std::optional<FPValueAndVReg> FPValReg;
    MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());

  // Non-inlineable splat floats should not fall-through for integer immediate

AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
  std::optional<ValueAndVReg> ShiftAmt;
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() % 8 == 0) {
    Key = ShiftAmt->Value.getZExtValue() / 8;

AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
  std::optional<ValueAndVReg> ShiftAmt;
      MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
      ShiftAmt->Value.getZExtValue() == 16) {

AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
  // FIXME: Handle op_sel

AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
      copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));

AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(),
                                           /*IsCanonicalizing=*/true,
      copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
  // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
  // then we can select all ptr + 32-bit offsets.
  getAddrModeInfo(*MI, *MRI, AddrInfo);

  if (AddrInfo.empty())

  const GEPInfo &GEPI = AddrInfo[0];
  std::optional<int64_t> EncodedImm;

                                          /*HasSOffset=*/true);
  if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
      AddrInfo.size() > 1) {
    const GEPInfo &GEPI2 = AddrInfo[1];
    if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
        Base = GEPI2.SgprParts[0];
        *SOffset = OffsetReg;

        // For unbuffered smem loads, it is illegal for the Immediate Offset
        // to be negative if the resulting (Offset + (M0 or SOffset or zero))
        // is negative. Handle the case where the Immediate Offset + SOffset
        if (*Offset + SKnown.getMinValue().getSExtValue() < 0)

                                          /*HasSOffset=*/false);
  if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
    Base = GEPI.SgprParts[0];

  // SGPR offset is unsigned.
  if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
    // If we make it this far we have a load with a 32-bit immediate offset.
    // It is OK to select this using a sgpr offset, because we have already
    // failed trying to select this load into one of the _IMM variants since
    // the _IMM Patterns are considered before the _SGPR patterns.
    Base = GEPI.SgprParts[0];
    *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
    BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)

  if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
    Base = GEPI.SgprParts[0];
    *SOffset = OffsetReg;
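  // Illustrative note (not part of the original source): the negative-offset
  // check above rejects encodings where the immediate plus the smallest
  // possible SOffset value could be negative. For instance, an immediate of
  // -16 combined with an SOffset whose known minimum is 0 gives -16 + 0 < 0,
  // so the combined IMM+SOffset form is not used for that access.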
AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
  if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))

AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
  getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);

  if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)

  const GEPInfo &GEPInfo = AddrInfo[0];
  Register PtrReg = GEPInfo.SgprParts[0];
  std::optional<int64_t> EncodedImm =

AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
  if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))

AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
  if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
std::pair<Register, int>
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

       !isFlatScratchBaseLegal(Root.getReg())))

  unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();

  return std::pair(PtrBase, ConstOffset);

AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {

AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {

AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {

// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0) {
      ImmOffset = ConstOffset;

      if (isSGPR(PtrBaseDef->Reg)) {
        if (ConstOffset > 0) {
          // Offset is too large.
          //
          // saddr + large_offset -> saddr +
          //                         (voffset = large_offset & ~MaxOffset) +
          //                         (large_offset & MaxOffset);
          int64_t SplitImmOffset, RemainderOffset;

          if (isUInt<32>(RemainderOffset)) {
                MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
            BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
                .addImm(RemainderOffset);

        // We are adding a 64 bit SGPR and a constant. If constant bus limit
        // is 1 we would need to perform 1 or 2 extra moves for each half of
        // the constant and it is better to do a scalar add and then issue a
        // single VALU instruction to materialize zero. Otherwise it is less
        // instructions to perform VALU adds with immediates or inline literals.
        unsigned NumLiterals =

  // Match the variable offset.
  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    // Look through the SGPR->VGPR copy.
    Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
    // It's possible voffset is an SGPR here, but the copy to VGPR will be

  // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
  if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
      AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))

  // It's cheaper to materialize a single 32-bit zero for vaddr than the two
  // moves required to copy a 64-bit SGPR to VGPR.
  Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
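  // Illustrative note (not part of the original source): as a worked example
  // of the split above, if the legal immediate range were 0..0xfff, a constant
  // offset of 0x1234 would become voffset = 0x1234 & ~0xfff = 0x1000
  // (materialized into a VGPR with V_MOV_B32) plus an instruction immediate of
  // 0x1234 & 0xfff = 0x234. The actual bound depends on the subtarget's
  // flat/global offset width, so 0xfff here is only for illustration.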
AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
    ImmOffset = ConstOffset;

  if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = AddrDef->MI->getOperand(1).getIndex();

  if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
    if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
        isSGPR(RHSDef->Reg)) {
      int FI = LHSDef->MI->getOperand(1).getIndex();

      SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)

// Check whether the flat scratch SVS swizzle bug affects this access.
bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
  // The bug affects the swizzling of SVS accesses if there is any carry out
  // from the two low order bits (i.e. from bit 1 into bit 2) when adding
  // voffset to (soffset + inst_offset).
  uint64_t VMax = VKnown.getMaxValue().getZExtValue();

  return (VMax & 3) + (SMax & 3) >= 4;
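// Illustrative note (not part of the original source): the check is
// conservative because it only looks at the known maximum values. If the low
// two bits of voffset can be at most 3 and the low two bits of
// (soffset + inst_offset) can be at most 1, then 3 + 1 >= 4 and a carry from
// bit 1 into bit 2 is possible, so the SVS form is avoided; if the two maxima
// sum to 3 or less, no such carry can occur and the access is unaffected.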
AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
  int64_t ConstOffset;
  int64_t ImmOffset = 0;

  // Match the immediate offset first, which canonically is moved as low as
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);

  if (ConstOffset != 0 &&
    ImmOffset = ConstOffset;

  if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)

  if (OrigAddr != Addr) {
    if (!isFlatScratchBaseLegalSVImm(OrigAddr))
    if (!isFlatScratchBaseLegalSV(OrigAddr))

  if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))

  if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
    int FI = LHSDef->MI->getOperand(1).getIndex();
AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
  Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);

  // TODO: Should this be inside the render function? The iterator seems to
  BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),

  // Use constant zero for soffset and rely on eliminateFrameIndex
  // to choose the appropriate frame register if need be.

  // Try to fold a frame index directly into the MUBUF vaddr field, and any
  std::optional<int> FI;

  int64_t ConstOffset;
  std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
  if (ConstOffset != 0) {
    if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
  } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {

  // Use constant zero for soffset and rely on eliminateFrameIndex
  // to choose the appropriate frame register if need be.

bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
  // On Southern Islands instructions with a negative base value and an offset
  // don't seem to work.

bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
  if (Offset0 % Size != 0 || Offset1 % Size != 0)
  if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))

  // On Southern Islands instructions with a negative base value and an offset
  // don't seem to work.

// Return whether the operation has NoUnsignedWrap property.
  return Addr->getOpcode() == TargetOpcode::G_OR ||
         (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&

// Check that the base address of flat scratch load/store in the form of `base +
// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
// requirement). We always treat the first operand as the base address here.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative

  if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
    std::optional<ValueAndVReg> RhsValReg =
    // If the immediate offset is negative and within certain range, the base
    // address cannot also be negative. If the base is also negative, the sum
    // would be either negative or much larger than the valid range of scratch
    // memory a thread can access.
    if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
        RhsValReg->Value.getSExtValue() > -0x40000000)

// Check address value in SGPR/VGPR are legal for flat scratch in the form
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative

// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
  // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
  std::optional<DefinitionAndSourceRegister> BaseDef =
  std::optional<ValueAndVReg> RHSOffset =

  // If the immediate offset is negative and within certain range, the base
  // address cannot also be negative. If the base is also negative, the sum
  // would be either negative or much larger than the valid range of scratch
  // memory a thread can access.
       (RHSOffset->Value.getSExtValue() < 0 &&
        RHSOffset->Value.getSExtValue() > -0x40000000)))
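// Illustrative note (not part of the original source): the -0x40000000 bound
// above encodes the "certain range" mentioned in the comment. A small
// negative immediate such as -64 can only produce a legal, non-negative
// scratch address if the base itself is non-negative, so the base may be
// treated as unsigned; an offset at or below -0x40000000 is so large in
// magnitude that this reasoning no longer holds, and the conservative path is
// taken instead.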
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
                                                    unsigned ShAmtBits) const {
  assert(MI.getOpcode() == TargetOpcode::G_AND);

  std::optional<APInt> RHS =

  if (RHS->countr_one() >= ShAmtBits)

  return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
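// Illustrative note (not part of the original source): for a 32-bit shift the
// hardware only consumes the low 5 bits of the shift amount, so with
// ShAmtBits == 5 a shift whose amount operand is (amt & 31) needs no explicit
// AND: the mask 31 has five trailing ones, which is what the first
// countr_one() check detects. The second check additionally treats bits known
// to be zero in the LHS as if the mask already covered them.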
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
  std::optional<DefinitionAndSourceRegister> Def =
  assert(Def && "this shouldn't be an optional result");
  // FIXME: Copy check is a hack

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
  int64_t ConstAddr = 0;

  std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

    if (isDSOffsetLegal(PtrBase, Offset)) {
      return std::pair(PtrBase, Offset);
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {

  return std::pair(Root.getReg(), 0);
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
  std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);

AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 4);

AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
  return selectDSReadWrite2(Root, 8);

AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
  std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
  int64_t ConstAddr = 0;

  std::tie(PtrBase, Offset) =
      getPtrBaseWithConstantOffset(Root.getReg(), *MRI);

    int64_t OffsetValue0 = Offset;
    if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
      return std::pair(PtrBase, OffsetValue0 / Size);
  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {

  return std::pair(Root.getReg(), 0);
/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
/// the base value with the constant offset. There may be intervening copies
/// between \p Root and the identified constant. Returns \p Root, 0 if this does
/// not match the pattern.
std::pair<Register, int64_t>
AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
  if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)

  std::optional<ValueAndVReg> MaybeOffset =

/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
/// BasePtr is not valid, a null base pointer will be used.
  Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
  Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)
  B.buildInstr(AMDGPU::S_MOV_B32)

  // Build the half of the subregister with the constants before building the
  // full 128-bit register. If we are building multiple resource descriptors,
  // this will allow CSEing of the 2-component register.
  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addImm(AMDGPU::sub0)
      .addImm(AMDGPU::sub1);

    RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64)

  B.buildInstr(AMDGPU::REG_SEQUENCE)
      .addImm(AMDGPU::sub0_sub1)
      .addImm(AMDGPU::sub2_sub3);

  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
  // FIXME: Why are half the "default" bits ignored based on the addressing

  uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
  // FIXME: Why are half the "default" bits ignored based on the addressing

AMDGPUInstructionSelector::MUBUFAddressData
AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
  MUBUFAddressData Data;

  std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);

    Data.N2 = InputAdd->getOperand(1).getReg();
    Data.N3 = InputAdd->getOperand(2).getReg();

    // FIXME: Need to fix extra SGPR->VGPR copies inserted
    // FIXME: Don't know this was defined by operand 0
    // TODO: Remove this when we have copy folding optimizations after

/// Return if the addr64 mubuf mode should be used for the given address.
bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
  // (ptr_add N2, N3) -> addr64, or
  // (ptr_add (ptr_add N2, N3), C1) -> addr64
  return N0Bank->getID() == AMDGPU::VGPRRegBankID;

/// Split an immediate offset \p ImmOffset depending on whether it fits in the
/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
  // Illegal offset, store it in soffset.
  SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  B.buildInstr(AMDGPU::S_MOV_B32)
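// Illustrative note (not part of the original source): the MUBUF immediate
// offset field is narrow (12 bits unsigned on most subtargets), so a value
// such as 8192 cannot be encoded directly in the instruction. As the comment
// above says, an illegal offset is instead materialized into an SGPR with
// S_MOV_B32 and passed through the soffset operand.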
bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
  // FIXME: Predicates should stop this from reaching here.
  // addr64 bit was removed for volcanic islands.

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (!shouldUseAddr64(AddrData))

  // Base pointer for the SRD.

    // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
    // addr64, and construct the default resource from a 0 address.

    // N2 is not divergent.
  } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
    // Use the default null pointer in the resource

  // (N0 + C1) -> offset
  splitIllegalMUBUFOffset(B, SOffset, Offset);

bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
  // FIXME: Pattern should not reach here.

  MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
  if (shouldUseAddr64(AddrData))

  // (N0 + C1) -> offset
  // TODO: Look through extensions for 32-bit soffset.
  splitIllegalMUBUFOffset(B, SOffset, Offset);

AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
  if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))

  // FIXME: Use defaulted operands for trailing 0s and remove from the complex
  MIB.addReg(AMDGPU::SGPR_NULL);

AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
  if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
  MIB.addReg(AMDGPU::SGPR_NULL);

AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
    SOffset = AMDGPU::SGPR_NULL;

/// Get an immediate that must be 32-bits, and treated as zero extended.
static std::optional<uint64_t>
  // getIConstantVRegVal sexts any values, so see if that matters.
  if (!OffsetVal || !isInt<32>(*OffsetVal))
  return Lo_32(*OffsetVal);
AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
  std::optional<uint64_t> OffsetVal =
  std::optional<int64_t> EncodedImm =

AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
  std::optional<int64_t> EncodedImm =

AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
  // Match the (soffset + offset) pair as a 32-bit register base and
  // an immediate offset.
      *MRI, Root.getReg(), KB, /*CheckNUW*/ true);

  std::optional<int64_t> EncodedOffset =

std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
                                                     bool &Matched) const {
  std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());

  // Only change Src if src modifier could be gained. In such cases new Src
  // could be sgpr but this does not violate constant bus restriction for
  // instruction that is being selected.
  const auto CheckAbsNeg = [&]() {
    // Be careful about folding modifiers if we already have an abs. fneg is
    // applied last, so we don't want to apply an earlier fneg.
    std::tie(Src, ModsTmp) = selectVOP3ModsImpl(Src);

  // op_sel/op_sel_hi decide the source type and source.
  // If the source's op_sel_hi is set, it indicates to do a conversion from
  // fp16. If the source's op_sel is set, it picks the high half of the

AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);

AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
  std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
  Register CCReg = I.getOperand(0).getReg();

  BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
      .addImm(I.getOperand(2).getImm());

bool AMDGPUInstructionSelector::selectSGetBarrierState(
  std::optional<int64_t> BarValImm =

  auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)

  unsigned Opc = BarValImm ? AMDGPU::S_GET_BARRIER_STATE_IMM
                           : AMDGPU::S_GET_BARRIER_STATE_M0;

  auto DstReg = I.getOperand(0).getReg();
      TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);

  if (HasInlineConst) {
    case Intrinsic::amdgcn_s_barrier_join:
      return AMDGPU::S_BARRIER_JOIN_IMM;
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_IMM;

    case Intrinsic::amdgcn_s_barrier_join:
      return AMDGPU::S_BARRIER_JOIN_M0;
    case Intrinsic::amdgcn_s_get_named_barrier_state:
      return AMDGPU::S_GET_BARRIER_STATE_M0;

bool AMDGPUInstructionSelector::selectNamedBarrierInit(
  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  // MO = ((CntOp & 0x3F) << shAmt) | BarID
  Register TmpReg2 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register TmpReg3 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  constexpr unsigned ShAmt = 16;
  Register TmpReg4 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  unsigned Opc = IntrID == Intrinsic::amdgcn_s_barrier_init
                     ? AMDGPU::S_BARRIER_INIT_M0
                     : AMDGPU::S_BARRIER_SIGNAL_M0;

bool AMDGPUInstructionSelector::selectNamedBarrierInst(
  MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_named_barrier_state
  std::optional<int64_t> BarValImm =

  // BarID = (BarOp >> 4) & 0x3F
  Register TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
  Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);

  auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)

  if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
    auto DstReg = I.getOperand(0).getReg();
        TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);

  auto BarId = ((*BarValImm) >> 4) & 0x3F;
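  // Illustrative note (not part of the original source): following the
  // "BarID = (BarOp >> 4) & 0x3F" encoding above, a barrier operand of 0x25
  // yields barrier ID 2, and for s_barrier_init the member count is packed
  // into bits [21:16] of M0 per "MO = ((CntOp & 0x3F) << 16) | BarID", so a
  // count of 8 with that ID produces M0 = (8 << 16) | 2 = 0x80002.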
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());

  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());

  assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1);
  MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());

  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
         "Expected G_CONSTANT");
  MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());

/// This only really exists to satisfy DAG type checking machinery, so is a
  MIB.addImm(MI.getOperand(OpIdx).getImm() != 0);

  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_0(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_0_1(
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x2)

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_0(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_1_1(
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm((MI.getOperand(OpIdx).getImm() & 0x1)

void AMDGPUInstructionSelector::renderDstSelToOpSelXForm(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderSrcSelToOpSelXForm(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderSrcAndDstSelToOpSelXForm_2_0(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

void AMDGPUInstructionSelector::renderDstSelToOpSel3XFormXForm(
  assert(OpIdx >= 0 && "expected to match an immediate operand");

  assert(OpIdx >= 0 && "expected to match an immediate operand");
  MIB.addImm(MI.getOperand(OpIdx).getImm() &

  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const bool Swizzle = MI.getOperand(OpIdx).getImm() &

void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
  assert(OpIdx >= 0 && "expected to match an immediate operand");
  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &

  const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
  assert(ExpVal != INT_MIN);

  // "round.towardzero" -> TowardZero 0        -> FP_ROUND_ROUND_TO_ZERO 3
  // "round.tonearest"  -> NearestTiesToEven 1 -> FP_ROUND_ROUND_TO_NEAREST 0
  // "round.upward"     -> TowardPositive 2    -> FP_ROUND_ROUND_TO_INF 1
  // "round.downward"   -> TowardNegative 3    -> FP_ROUND_ROUND_TO_NEGINF 2
  MIB.addImm((MI.getOperand(OpIdx).getImm() + 3) % 4);
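  // Illustrative note (not part of the original source): the (Imm + 3) % 4
  // arithmetic remaps the generic rounding-mode numbering to the hardware
  // encoding listed above, e.g. TowardZero (0) -> (0 + 3) % 4 = 3 and
  // NearestTiesToEven (1) -> (1 + 3) % 4 = 0, matching the
  // FP_ROUND_ROUND_TO_ZERO / FP_ROUND_ROUND_TO_NEAREST values.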
/// Convert from 2-bit value to enum values used for op_sel* source modifiers.
void AMDGPUInstructionSelector::renderScaledMAIIntrinsicOperand(
  unsigned Val = MI.getOperand(OpIdx).getImm();

bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {

bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
unsigned getAddrSpace() const
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
const Value * getValue() const
Return the base address of the memory access.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
const ConstantInt * getCImm() const
void setImm(int64_t immVal)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
ArrayRef< int > getShuffleMask() const
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
static MachineOperand CreateImm(int64_t Val)
bool isEarlyClobber() const
Register getReg() const
getReg - Returns the register number.
bool isInternalRead() const
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Root of the metadata hierarchy.
A Module instance is used to store all the information related to an LLVM module.
Analysis providing profile information.
static const TargetRegisterClass * constrainGenericRegister(Register Reg, const TargetRegisterClass &RC, MachineRegisterInfo &MRI)
Constrain the (possibly generic) virtual register Reg to RC.
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
TypeSize getSizeInBits(Register Reg, const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) const
Get the size in bits of Reg.
This class implements the register bank concept.
unsigned getID() const
Get the identifier of this register bank.
Wrapper class representing virtual and physical registers.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCRegister getReturnAddressReg(const MachineFunction &MF) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getRegClassForSizeOnBank(unsigned Size, const RegisterBank &Bank) const
const TargetRegisterClass * getConstrainedRegClassForOperand(const MachineOperand &MO, const MachineRegisterInfo &MRI) const override
const TargetRegisterClass * getRegClassForTypeOnBank(LLT Ty, const RegisterBank &Bank) const
const TargetRegisterClass * getBoolRC() const
MCRegister getExec() const
const TargetRegisterClass * getWaveMaskRegClass() const
static bool isSGPRClass(const TargetRegisterClass *RC)
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
unsigned getID() const
Return the register class ID number.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
OSType getOS() const
Get the parsed operating system type of this triple.
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Value(Type *Ty, unsigned scid)
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
std::optional< int64_t > getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST, int64_t ByteOffset)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST)
bool isGFX11Plus(const MCSubtargetInfo &STI)
bool isGFX10Plus(const MCSubtargetInfo &STI)
std::optional< int64_t > getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset, bool IsBuffer, bool HasSOffset)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
Intrinsic::ID getIntrinsicID(const MachineInstr &I)
Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
IndexMode
ARM Index Modes.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
operand_type_match m_Reg()
GCstAndRegMatch m_GCst(std::optional< ValueAndVReg > &ValReg)
UnaryOp_match< SrcTy, TargetOpcode::COPY > m_Copy(SrcTy &&Src)
SpecificConstantMatch m_SpecificICst(int64_t RequestedValue)
Matches a constant equal to RequestedValue.
UnaryOp_match< SrcTy, TargetOpcode::G_ZEXT > m_GZExt(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_XOR, true > m_GXor(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_FPEXT > m_GFPExt(const SrcTy &Src)
ConstantMatch< APInt > m_ICst(APInt &Cst)
SpecificConstantMatch m_AllOnesInt()
BinaryOp_match< LHS, RHS, TargetOpcode::G_OR, true > m_GOr(const LHS &L, const RHS &R)
ICstOrSplatMatch< APInt > m_ICstOrSplat(APInt &Cst)
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
BinaryOp_match< LHS, RHS, TargetOpcode::G_PTR_ADD, false > m_GPtrAdd(const LHS &L, const RHS &R)
Or< Preds... > m_any_of(Preds &&... preds)
BinaryOp_match< LHS, RHS, TargetOpcode::G_AND, true > m_GAnd(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_BITCAST > m_GBitcast(const SrcTy &Src)
UnaryOp_match< SrcTy, TargetOpcode::G_FNEG > m_GFNeg(const SrcTy &Src)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
UnaryOp_match< SrcTy, TargetOpcode::G_FABS > m_GFabs(const SrcTy &Src)
BinaryOp_match< LHS, RHS, TargetOpcode::G_LSHR, false > m_GLShr(const LHS &L, const RHS &R)
UnaryOp_match< SrcTy, TargetOpcode::G_TRUNC > m_GTrunc(const SrcTy &Src)
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
OneUse_match< T > m_OneUse(const T &SubPattern)
BinaryOp_match< cst_pred_ty< is_all_ones >, ValTy, Instruction::Xor, true > m_Not(const ValTy &V)
Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Register constrainOperandRegClass(const MachineFunction &MF, const TargetRegisterInfo &TRI, MachineRegisterInfo &MRI, const TargetInstrInfo &TII, const RegisterBankInfo &RBI, MachineInstr &InsertPt, const TargetRegisterClass &RegClass, MachineOperand &RegMO)
Constrain the Register operand OpIdx, so that it is now constrained to the TargetRegisterClass passed...
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
std::optional< APInt > getIConstantVRegVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT, return the corresponding value.
bool constrainSelectedInstRegOperands(MachineInstr &I, const TargetInstrInfo &TII, const TargetRegisterInfo &TRI, const RegisterBankInfo &RBI)
Mutate the newly-selected instruction I to constrain its (possibly generic) virtual register operands...
MachineInstr * getDefIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, folding away any trivial copies.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
std::optional< ValueAndVReg > getAnyConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true, bool LookThroughAnyExt=false)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT or G_FCONST...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
unsigned getUndefRegState(bool B)
@ SMax
Signed integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
std::optional< DefinitionAndSourceRegister > getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the def instruction for Reg, and underlying value Register folding away any copies.
Register getSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI)
Find the source register for Reg, folding away any trivial copies.
@ Default
The result values are uniform if and only if all operands are uniform.
This struct is a compact representation of a valid (non-zero power of two) alignment.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false)
Compute knownbits resulting from addition of LHS and RHS.
This class contains a discriminated union of information about pointers in memory operands,...
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.