1//===-- SIOptimizeExecMaskingPreRA.cpp ------------------------------------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 10/// This pass performs exec mask handling peephole optimizations which need 11/// to be done before register allocation to reduce register pressure. 13//===----------------------------------------------------------------------===// 24#define DEBUG_TYPE "si-optimize-exec-masking-pre-ra" 37unsigned OrSaveExecOpc;
55return"SI optimize exec mask operations pre-RA";
65}
// End anonymous namespace. 68"SI optimize exec mask operations pre-RA",
false,
false)
73char SIOptimizeExecMaskingPreRA::
ID = 0;
78returnnew SIOptimizeExecMaskingPreRA();
81// See if there is a def between \p AndIdx and \p SelIdx that needs to live 89// FIXME: Why do we bother trying to handle physical registers here? 108// %sel = V_CNDMASK_B32_e64 0, 1, %cc 109// %cmp = V_CMP_NE_U32 1, %sel 110// $vcc = S_AND_B64 $exec, %cmp 113// $vcc = S_ANDN2_B64 $exec, %cc 116// It is the negation pattern inserted by DAGCombiner::visitBRCOND() in the 117// rebuildSetCC(). We start with S_CBRANCH to avoid exhaustive search, but 118// only the first 3 instructions are really needed. S_AND_B64 with exec is a 119// required part of the pattern since V_CNDMASK_B32 writes zeroes for inactive 122// Returns true on success. 125 unsigned Opc = MI.getOpcode();
126 return Opc == AMDGPU::S_CBRANCH_VCCZ ||
127 Opc == AMDGPU::S_CBRANCH_VCCNZ; });
132TRI->findReachingDef(CondReg, AMDGPU::NoSubRegister, *
I, *
MRI, LIS);
133if (!
And ||
And->getOpcode() != AndOpc ||
134 !
And->getOperand(1).isReg() || !
And->getOperand(2).isReg())
141 AndCC = &
And->getOperand(2);
144 }
elseif (
And->getOperand(2).getReg() !=
Register(ExecReg)) {
148auto *
Cmp =
TRI->findReachingDef(CmpReg, CmpSubReg, *
And, *
MRI, LIS);
149if (!Cmp || !(
Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e32 ||
150Cmp->getOpcode() == AMDGPU::V_CMP_NE_U32_e64) ||
151Cmp->getParent() !=
And->getParent())
165auto *Sel =
TRI->findReachingDef(SelReg, Op1->
getSubReg(), *Cmp, *
MRI, LIS);
166if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
169if (
TII->hasModifiersSet(*Sel, AMDGPU::OpName::src0_modifiers) ||
170TII->hasModifiersSet(*Sel, AMDGPU::OpName::src1_modifiers))
173 Op1 =
TII->getNamedOperand(*Sel, AMDGPU::OpName::src0);
174 Op2 =
TII->getNamedOperand(*Sel, AMDGPU::OpName::src1);
182// If there was a def between the select and the and, we would need to move it 187// Cannot safely mirror live intervals with PHI nodes, so check for these 188// before optimization. 189SlotIndex SelIdx = LIS->getInstructionIndex(*Sel);
193 return VNI->isPHIDef();
197// TODO: Guard against implicit def operands? 198LLVM_DEBUG(
dbgs() <<
"Folding sequence:\n\t" << *Sel <<
'\t' << *Cmp <<
'\t' 203And->getOperand(0).getReg())
212SlotIndex AndIdx = LIS->ReplaceMachineInstrInMaps(*
And, *Andn2);
213And->eraseFromParent();
217// Update live intervals for CCReg before potentially removing CmpReg/SelReg, 218// and their associated liveness information. 219SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
224 LIS->removeInterval(CCReg);
225 LIS->createAndComputeVirtRegInterval(CCReg);
228 LIS->removeAllRegUnitsForPhysReg(CCReg);
230// Try to remove compare. Cmp value should not be used between cmp 231// and s_and_b64 if VCC, or just be unused if any other register. 237 return MI.readsRegister(CondReg, TRI);
241 LIS->removeVRegDefAt(*CmpLI, CmpIdx.
getRegSlot());
242 LIS->RemoveMachineInstrFromMaps(*Cmp);
243Cmp->eraseFromParent();
245// Try to remove v_cndmask_b32. 246// Kill status must be checked before shrinking the live range. 248 LIS->shrinkToUses(SelLI);
250if (
MRI->use_nodbg_empty(SelReg) && (IsKill ||
IsDead)) {
253 LIS->removeVRegDefAt(*SelLI, SelIdx.
getRegSlot());
254 LIS->RemoveMachineInstrFromMaps(*Sel);
255bool ShrinkSel = Sel->getOperand(0).readsReg();
256 Sel->eraseFromParent();
258// The result of the V_CNDMASK was a subreg def which counted as a read 259// from the other parts of the reg. Shrink their live ranges. 260 LIS->shrinkToUses(SelLI);
269// %dst = S_OR_SAVEEXEC %src 270// ... instructions not modifying exec ... 271// %tmp = S_AND $exec, %dst 272// $exec = S_XOR_term $exec, %tmp 274// %dst = S_OR_SAVEEXEC %src 275// ... instructions not modifying exec ... 276// $exec = S_XOR_term $exec, %dst 278// Clean up potentially unnecessary code added for safety during 279// control flow lowering. 281// Return whether any changes were made to MBB. 286// Check this is an else block. 289if (SaveExecMI.
getOpcode() != OrSaveExecOpc)
293 return MI.getOpcode() == XorTermrOpc;
305// Find potentially unnecessary S_AND 308while (
I !=
First && !AndExecMI) {
309if (
I->getOpcode() == AndOpc &&
I->getOperand(0).getReg() == DstReg &&
310I->getOperand(1).getReg() ==
Register(ExecReg))
317// Check for exec modifying instructions. 318// Note: exec defs do not create live ranges beyond the 319// instruction so isDefBetween cannot be used. 320// Instead just check that the def segments are adjacent. 321SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
322SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
324LiveRange &RegUnit = LIS->getRegUnit(Unit);
325if (RegUnit.
find(StartIdx) != std::prev(RegUnit.
find(EndIdx)))
329// Remove unnecessary S_AND 330 LIS->removeInterval(SavedExecReg);
331 LIS->removeInterval(DstReg);
335 LIS->RemoveMachineInstrFromMaps(*AndExecMI);
338 LIS->createAndComputeVirtRegInterval(DstReg);
343bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(
MachineFunction &MF) {
348TRI =
ST.getRegisterInfo();
349TII =
ST.getInstrInfo();
351 LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
354 AndOpc =
Wave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
355 Andn2Opc =
Wave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
357Wave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
358 XorTermrOpc =
Wave32 ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
367if (optimizeElseBranch(
MBB)) {
368 RecalcRegs.
insert(AMDGPU::SCC);
372if (optimizeVcndVcmpPair(
MBB)) {
373 RecalcRegs.
insert(AMDGPU::VCC_LO);
374 RecalcRegs.insert(AMDGPU::VCC_HI);
375 RecalcRegs.insert(AMDGPU::SCC);
379// Try to remove unneeded instructions before s_endpgm. 384// Skip this if the endpgm has any implicit uses, otherwise we would need 385// to be careful to update / remove them. 386// S_ENDPGM always has a single imm operand that is not used other than to 387// end up in the encoding 389if (
Term.getOpcode() != AMDGPU::S_ENDPGM ||
Term.getNumOperands() != 1)
395auto *CurBB =
Blocks.pop_back_val();
396autoI = CurBB->rbegin(), E = CurBB->rend();
398if (
I->isUnconditionalBranch() ||
I->getOpcode() == AMDGPU::S_ENDPGM)
400elseif (
I->isBranch())
405if (
I->isDebugInstr()) {
410if (
I->mayStore() ||
I->isBarrier() ||
I->isCall() ||
411I->hasUnmodeledSideEffects() ||
I->hasOrderedMemoryRef())
415 <<
"Removing no effect instruction: " << *
I <<
'\n');
417for (
auto &
Op :
I->operands()) {
419 RecalcRegs.insert(
Op.getReg());
422auto Next = std::next(
I);
423 LIS->RemoveMachineInstrFromMaps(*
I);
433// Try to ascend predecessors. 434for (
auto *Pred : CurBB->predecessors()) {
435if (Pred->succ_size() == 1)
442// If the only user of a logical operation is move to exec, fold it now 443// to prevent forming of saveexec. I.e.: 445// %0:sreg_64 = COPY $exec 446// %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64 448// %1 = S_AND_B64 $exec, %2:sreg_64 449unsigned ScanThreshold = 10;
451 && ScanThreshold--; ++
I) {
452// Continue scanning if this is not a full exec copy 453if (!(
I->isFullCopy() &&
I->getOperand(1).getReg() ==
Register(ExecReg)))
456Register SavedExec =
I->getOperand(0).getReg();
457if (SavedExec.
isVirtual() &&
MRI->hasOneNonDBGUse(SavedExec)) {
462if (SingleExecUser->
getParent() ==
I->getParent() &&
464TII->isOperandLegal(*SingleExecUser,
Idx, &
I->getOperand(1))) {
466 LIS->RemoveMachineInstrFromMaps(*
I);
468MRI->replaceRegWith(SavedExec, ExecReg);
469 LIS->removeInterval(SavedExec);
478for (
auto Reg : RecalcRegs) {
479if (
Reg.isVirtual()) {
480 LIS->removeInterval(Reg);
481if (!
MRI->reg_empty(Reg))
482 LIS->createAndComputeVirtRegInterval(Reg);
484 LIS->removeAllRegUnitsForPhysReg(Reg);
unsigned const MachineRegisterInfo * MRI
Provides AMDGPU specific target descriptions.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
DenseMap< Block *, BlockRelaxAux > Blocks
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
SI optimize exec mask operations pre RA
static bool isDefBetween(const LiveRange &LR, SlotIndex AndIdx, SlotIndex SelIdx)
SI optimize exec mask operations
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
This class represents an Operation in the Expression.
Implements a dense probed hash-table based set.
FunctionPass class - This class is used to implement most global optimizations.
LiveInterval - This class represents the liveness of a register, or stack slot.
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveRange & getRegUnit(unsigned Unit)
Return the live range for register unit Unit.
LiveInterval & getInterval(Register Reg)
Result of a LiveRange query.
bool isDeadDef() const
Return true if this instruction has a dead def.
VNInfo * valueIn() const
Return the value that is live-in to the instruction.
VNInfo * valueOut() const
Return the value leaving the instruction, if any.
bool isKill() const
Return true if the live-in value is killed by this instruction.
This class represents the liveness of a register, stack slot, etc.
iterator_range< vni_iterator > vnis()
LiveQueryResult Query(SlotIndex Idx) const
Query Liveness at Idx.
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
Wrapper class representing physical registers. Should be passed by value.
static MCRegister from(unsigned Val)
Check the provided unsigned value is a valid MCRegister.
iterator_range< iterator > terminators()
reverse_iterator rbegin()
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
int findRegisterUseOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isKill=false) const
Returns the operand index that is a use of the specific register or -1 if it is not found.
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
Wrapper class representing virtual and physical registers.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
SlotIndex - An opaque wrapper around machine indexes.
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
VNInfo - Value Number Information.
std::pair< iterator, bool > insert(const ValueT &V)
self_iterator getIterator()
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Reg
All possible values of the reg field in the ModR/M byte.
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & SIOptimizeExecMaskingPreRAID
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ And
Bitwise or logical AND of integers.
void initializeSIOptimizeExecMaskingPreRAPass(PassRegistry &)
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
FunctionPass * createSIOptimizeExecMaskingPreRAPass()
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.