1//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 10/// This pass lowers the pseudo control flow instructions to real 11/// machine instructions. 13/// All control flow is handled using predicated instructions and 14/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector 15/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs 16/// by writing to the 64-bit EXEC register (each bit corresponds to a 17/// single vector ALU). Typically, for predicates, a vector ALU will write 18/// to its bit of the VCC register (like EXEC VCC is 64-bits, one for each 19/// Vector ALU) and then the ScalarALU will AND the VCC register with the 20/// EXEC to update the predicates. 23/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2 24/// %sgpr0 = SI_IF %vcc 25/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 26/// %sgpr0 = SI_ELSE %sgpr0 27/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0 32/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc // Save and update the exec mask 33/// %sgpr0 = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask 34/// S_CBRANCH_EXECZ label0 // This instruction is an optional 35/// // optimization which allows us to 36/// // branch if all the bits of 38/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch 41/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then 43/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask 44/// S_CBRANCH_EXECZ label1 // Use our branch optimization 45/// // instruction again. 46/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the ELSE block 48/// %exec = S_OR_B64 %exec, %sgpr0 // Re-enable saved exec mask bits 49//===----------------------------------------------------------------------===// 64#define DEBUG_TYPE "si-lower-control-flow" 72classSILowerControlFlow {
93unsigned OrSaveExecOpc;
96bool EnableOptimizeEndCf =
false;
116// Skip to the next instruction, ignoring debug instructions, and trivial 117// block boundaries (blocks that have one (typically fallthrough) successor, 118// and the successor has one predecessor. 123 /// Find the insertion point for a new conditional branch. 129// FIXME: What if we had multiple pre-existing conditional branches? 131while (
I !=
End && !
I->isUnconditionalBranch())
136// Remove redundant SI_END_CF instructions. 142 : LIS(LIS), LV(LV), MDT(MDT) {}
155return"SI Lower control flow pseudo instructions";
160// Should preserve the same set that TwoAddressInstructions does. 169}
// end anonymous namespace 171char SILowerControlFlowLegacy::ID = 0;
190while (!Worklist.
empty()) {
205Register SaveExecReg =
MI.getOperand(0).getReg();
206auto U =
MRI->use_instr_nodbg_begin(SaveExecReg);
208if (U ==
MRI->use_instr_nodbg_end() ||
209 std::next(U) !=
MRI->use_instr_nodbg_end() ||
210 U->getOpcode() != AMDGPU::SI_END_CF)
220Register SaveExecReg =
MI.getOperand(0).getReg();
222assert(
Cond.getSubReg() == AMDGPU::NoSubRegister);
227// If there is only one use of save exec register and that use is SI_END_CF, 228// we can optimize SI_IF by returning the full saved exec mask instead of 233// Check for SI_KILL_*_TERMINATOR on path from if to endif. 234// if there is any such terminator simplifications are not safe. 235autoUseMI =
MRI->use_instr_nodbg_begin(SaveExecReg);
239// Add an implicit def of exec to discourage scheduling VALU after this which 240// will interfere with trying to form s_and_saveexec_b64 later. 241Register CopyReg = SimpleIf ? SaveExecReg
242 :
MRI->createVirtualRegister(BoolRC);
247 LoweredIf.
insert(CopyReg);
258 setImpSCCDefDead(*
And,
true);
266 setImpSCCDefDead(*
Xor, ImpDefSCC.
isDead());
269// Use a copy that is a terminator to get correct spill code placement it with 277// Skip ahead to the unconditional branch in case there are other terminators 279I = skipToUncondBrOrEnd(
MBB,
I);
281// Insert the S_CBRANCH_EXECZ instruction which will be optimized later 282// during SIPreEmitPeephole. 284 .
add(
MI.getOperand(2));
293// Replace with and so we don't need to fix the live interval for condition 305// FIXME: Is there a better way of adjusting the liveness? It shouldn't be 306// hard to add another def here but I'm not sure how to correctly update the 308 RecomputeRegs.
insert(SaveExecReg);
323// This must be inserted before phis and any spill code inserted before the 325Register SaveReg =
MRI->createVirtualRegister(BoolRC);
328 .
add(
MI.getOperand(1));
// Saved EXEC 336// This accounts for any modification of the EXEC mask within the block and 337// can be optimized out pre-RA when not required. 347// Skip ahead to the unconditional branch in case there are other terminators 349 ElsePt = skipToUncondBrOrEnd(
MBB, ElsePt);
369 RecomputeRegs.
insert(SrcReg);
370 RecomputeRegs.
insert(DstReg);
373// Let this be recomputed. 380auto Dst =
MI.getOperand(0).getReg();
382// Skip ANDing with exec if the break condition is already masked by exec 383// because it is a V_CMP in the same basic block. (We know the break 384// condition operand was an i1 in IR, so if it is a VALU instruction it must 385// be one with a carry-out.) 386bool SkipAnding =
false;
387if (
MI.getOperand(1).isReg()) {
389 SkipAnding =
Def->getParent() ==
MI.getParent()
394// AND the break condition operand with exec, then OR that into the "loop 399 AndReg =
MRI->createVirtualRegister(BoolRC);
402 .
add(
MI.getOperand(1));
407 .
add(
MI.getOperand(2));
410 .
add(
MI.getOperand(1))
411 .
add(
MI.getOperand(2));
421// Read of original operand 1 is on And now not Or. 422 RecomputeRegs.
insert(
And->getOperand(2).getReg());
438 .
add(
MI.getOperand(0));
442auto BranchPt = skipToUncondBrOrEnd(
MBB,
MI.getIterator());
445 .
add(
MI.getOperand(1));
448 RecomputeRegs.
insert(
MI.getOperand(0).getReg());
457SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
467for ( ; It != E; ++It) {
468if (
TII->mayReadEXEC(*
MRI, *It))
475if (
B->succ_size() != 1)
478// If there is one trivial successor, advance to the next block. 492// If we have instructions that aren't prolog instructions, split the block 493// and emit a terminator instruction. This ensures correct spill placement. 494// FIXME: We should unconditionally split the block here. 495bool NeedBlockSplit =
false;
499if (
I->modifiesRegister(DataReg,
TRI)) {
500 NeedBlockSplit =
true;
505unsigned Opcode = OrOpc;
509if (MDT && SplitBB != &
MBB) {
524 .
add(
MI.getOperand(0));
528if (SplitBB != &
MBB) {
529// Track the set of registers defined in the original block so we don't 530// accidentally add the original block to AliveBlocks. AliveBlocks only 531// includes blocks which are live through, which excludes live outs and 538if (
Op.getReg().isVirtual())
544for (
unsigned i = 0, e =
MRI->getNumVirtRegs(); i != e; ++i) {
552if (
Kill->getParent() == SplitBB && !DefInOrigBlock.
contains(Reg))
560 LoweredEndCf.
insert(NewMI);
572// Returns replace operands for a logical operation, either single result 573// for exec or two operands if source was another equivalent operation. 574void SILowerControlFlow::findMaskOperands(
MachineInstr &
MI,
unsigned OpNo,
577if (!
Op.isReg() || !
Op.getReg().isVirtual()) {
583if (!Def ||
Def->getParent() !=
MI.getParent() ||
584 !(
Def->isFullCopy() || (
Def->getOpcode() ==
MI.getOpcode())))
587// Make sure we do not modify exec between def and use. 588// A copy with implicitly defined exec inserted earlier is an exclusion, it 589// does not really modify exec. 590for (
autoI =
Def->getIterator();
I !=
MI.getIterator(); ++
I)
591if (
I->modifiesRegister(AMDGPU::EXEC,
TRI) &&
592 !(
I->isCopy() &&
I->getOperand(0).getReg() != Exec))
595for (
constauto &
SrcOp :
Def->explicit_operands())
598 Src.push_back(
SrcOp);
601// Search and combine pairs of equivalent instructions, like 602// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y 603// S_OR_B64 x, (S_OR_B64 x, y) => S_OR_B64 x, y 604// One of the operands is exec mask. 606assert(
MI.getNumExplicitOperands() == 3);
608unsigned OpToReplace = 1;
609 findMaskOperands(
MI, 1, Ops);
610if (Ops.
size() == 1) OpToReplace = 2;
// First operand can be exec or its copy 611 findMaskOperands(
MI, 2, Ops);
612if (Ops.
size() != 3)
return;
614unsigned UniqueOpndIdx;
615if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
616elseif (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
617elseif (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
621MI.removeOperand(OpToReplace);
622MI.addOperand(Ops[UniqueOpndIdx]);
623if (
MRI->use_empty(Reg))
624MRI->getUniqueVRegDef(Reg)->eraseFromParent();
627void SILowerControlFlow::optimizeEndCf() {
628// If the only instruction immediately following this END_CF is another 629// END_CF in the only successor we can avoid emitting exec mask restore here. 630if (!EnableOptimizeEndCf)
636 skipIgnoreExecInstsTrivialSucc(
MBB, std::next(
MI->getIterator()));
637if (Next ==
MBB.
end() || !LoweredEndCf.
count(&*Next))
639// Only skip inner END_CF if outer ENDCF belongs to SI_IF. 640// If that belongs to SI_ELSE then saved mask has an inverted value. 642 =
TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
646if (Def && LoweredIf.
count(SavedExec)) {
652Reg =
TII->getNamedOperand(*
MI, AMDGPU::OpName::src1)->getReg();
653MI->eraseFromParent();
656 removeMBBifRedundant(
MBB);
668switch (
MI.getOpcode()) {
677case AMDGPU::SI_IF_BREAK:
685case AMDGPU::SI_WATERFALL_LOOP:
686MI.setDesc(
TII->get(AMDGPU::S_CBRANCH_EXECNZ));
689case AMDGPU::SI_END_CF:
690 SplitBB = emitEndCf(
MI);
694assert(
false &&
"Attempt to process unsupported instruction");
703case AMDGPU::S_AND_B64:
704case AMDGPU::S_OR_B64:
705case AMDGPU::S_AND_B32:
706case AMDGPU::S_OR_B32:
707// Cleanup bit manipulations on exec mask 708 combineMasks(MaskMI);
721if (!
I.isDebugInstr() && !
I.isUnconditionalBranch())
732if (
P->getFallThrough(
false) == &
MBB)
734P->ReplaceUsesOfBlockWith(&
MBB, Succ);
742// If Succ, the single successor of MBB, is dominated by MBB, MDT needs 743// updating by changing Succ's idom to the one of MBB; otherwise, MBB must 744// be a leaf node in MDT and could be erased directly. 753// Note: we cannot update block layout and preserve live intervals; 754// hence we must insert a branch. 767TII =
ST.getInstrInfo();
768TRI = &
TII->getRegisterInfo();
773 BoolRC =
TRI->getBoolRC();
776 AndOpc = AMDGPU::S_AND_B32;
777 OrOpc = AMDGPU::S_OR_B32;
778 XorOpc = AMDGPU::S_XOR_B32;
779 MovTermOpc = AMDGPU::S_MOV_B32_term;
780 Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
781 XorTermrOpc = AMDGPU::S_XOR_B32_term;
782 OrTermrOpc = AMDGPU::S_OR_B32_term;
783 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
784Exec = AMDGPU::EXEC_LO;
786 AndOpc = AMDGPU::S_AND_B64;
787 OrOpc = AMDGPU::S_OR_B64;
788 XorOpc = AMDGPU::S_XOR_B64;
789 MovTermOpc = AMDGPU::S_MOV_B64_term;
790 Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
791 XorTermrOpc = AMDGPU::S_XOR_B64_term;
792 OrTermrOpc = AMDGPU::S_OR_B64_term;
793 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
797// Compute set of blocks with kills 800for (
auto &
MBB : MF) {
801bool IsKillBlock =
false;
803if (
TII->isKillTerminator(
Term.getOpcode())) {
809if (CanDemote && !IsKillBlock) {
811if (
MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
822 BI != MF.end(); BI = NextBB) {
823 NextBB = std::next(BI);
833switch (
MI.getOpcode()) {
836case AMDGPU::SI_IF_BREAK:
837case AMDGPU::SI_WATERFALL_LOOP:
839case AMDGPU::SI_END_CF:
840 SplitMBB = process(
MI);
846MBB = Next->getParent();
861 RecomputeRegs.
clear();
862 LoweredEndCf.
clear();
869bool SILowerControlFlowLegacy::runOnMachineFunction(
MachineFunction &MF) {
870// This doesn't actually need LiveIntervals, but we can preserve them. 871auto *LISWrapper = getAnalysisIfAvailable<LiveIntervalsWrapperPass>();
872LiveIntervals *LIS = LISWrapper ? &LISWrapper->getLIS() :
nullptr;
873// This doesn't actually need LiveVariables, but we can preserve them. 874auto *LVWrapper = getAnalysisIfAvailable<LiveVariablesWrapperPass>();
876auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
878return SILowerControlFlow(LIS, LV, MDT).run(MF);
889bool Changed = SILowerControlFlow(LIS, LV, MDT).run(MF);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
Provides AMDGPU specific target descriptions.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
AMD GCN specific subclass of TargetSubtarget.
const HexagonInstrInfo * TII
unsigned const TargetRegisterInfo * TRI
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< bool > RemoveRedundantEndcf("amdgpu-remove-redundant-endcf", cl::init(true), cl::ReallyHidden)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI)
This file defines the SmallSet class.
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addUsedIfAvailable()
Add the specified Pass class to the set of analyses used by this pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents an Operation in the Expression.
Implements a dense probed hash-table based set.
Base class for the actual dominator tree node.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
void removeAllRegUnitsForPhysReg(MCRegister Reg)
Remove associated live ranges for the register units associated with Reg.
SlotIndex InsertMachineInstrInMaps(MachineInstr &MI)
void handleMove(MachineInstr &MI, bool UpdateFlags=false)
Call this method to notify LiveIntervals that instruction MI has been moved within a basic block.
void RemoveMachineInstrFromMaps(MachineInstr &MI)
void removeInterval(Register Reg)
Interval removal.
LiveInterval & createAndComputeVirtRegInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
void recomputeForSingleDefVirtReg(Register Reg)
Recompute liveness from scratch for a virtual register Reg that is known to have a single def that do...
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
int getNumber() const
MachineBasicBlocks are uniquely numbered at the function level, unless they're not in a MachineFuncti...
succ_iterator succ_begin()
unsigned succ_size() const
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
pred_iterator pred_begin()
bool isLayoutSuccessor(const MachineBasicBlock *MBB) const
Return true if the specified MBB will be emitted immediately after this block, such that if this bloc...
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
iterator_range< iterator > terminators()
DebugLoc findBranchDebugLoc()
Find and return the merged DebugLoc of the branch instructions of the block.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
Analysis pass which computes a MachineDominatorTree.
Analysis pass which computes a MachineDominatorTree.
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
bool dominates(const MachineInstr *A, const MachineInstr *B) const
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
virtual bool runOnMachineFunction(MachineFunction &MF)=0
runOnMachineFunction - This method must be overloaded to perform the desired machine code transformat...
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
MachineOperand class - Representation of each machine instruction operand.
void setIsDead(bool Val=true)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
virtual StringRef getPassName() const
getPassName - Return a nice clean name for a pass.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static bool isVALU(const MachineInstr &MI)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
A vector that has set insertion semantics.
void clear()
Completely clear the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
self_iterator getIterator()
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ Kill
The last use of a register.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
NodeAddr< DefNode * > Def
This is an optimization pass for GlobalISel generic memory operations.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
auto reverse(ContainerTy &&C)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
char & SILowerControlFlowLegacyID
@ Or
Bitwise or logical OR of integers.
@ Xor
Bitwise or logical XOR of integers.
@ And
Bitwise or logical AND of integers.
VarInfo - This represents the regions where a virtual register is live in the program.
std::vector< MachineInstr * > Kills
Kills - List of MachineInstruction's which are the last use of this virtual register (kill it) in the...