//===----- X86CallFrameOptimization.cpp - Optimize x86 call sequences ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines a pass that optimizes call sequences on x86.
// Currently, it converts movs of function parameters onto the stack into
// pushes. This is beneficial for two main reasons:
// 1) The push instruction encoding is much smaller than a stack-ptr-based mov.
// 2) It is possible to push memory arguments directly. So, if the
//    transformation is performed pre-reg-alloc, it can help relieve
//    register pressure.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "x86-cf-opt"

static cl::opt<bool>
    NoX86CFOpt("no-x86-call-frame-opt",
               cl::desc("Avoid optimizing x86 call frames for size"),
               cl::init(false), cl::Hidden);
namespace {

class X86CallFrameOptimization : public MachineFunctionPass {
public:
  X86CallFrameOptimization() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  static char ID;

private:
  // Information we know about a particular call site
  struct CallContext {
    CallContext() : FrameSetup(nullptr), ArgStoreVector(4, nullptr) {}

    // Iterator referring to the frame setup instruction
    MachineBasicBlock::iterator FrameSetup;

    // Actual call instruction
    MachineInstr *Call = nullptr;

    // A copy of the stack pointer
    MachineInstr *SPCopy = nullptr;

    // The total displacement of all passed parameters
    int64_t ExpectedDist = 0;

    // The sequence of storing instructions used to pass the parameters
    SmallVector<MachineInstr *, 4> ArgStoreVector;

    // True if this call site has no stack parameters
    bool NoStackParams = false;

    // True if this call site can use push instructions
    bool UsePush = false;
  };

  typedef SmallVector<CallContext, 8> ContextVector;

  void adjustCallSequence(MachineFunction &MF, const CallContext &Context);

  enum InstClassification { Convert, Skip, Exit };
  unsigned SlotSize = 0;
  unsigned Log2SlotSize = 0;
};

} // end anonymous namespace

char X86CallFrameOptimization::ID = 0;
INITIALIZE_PASS(X86CallFrameOptimization, DEBUG_TYPE,
                "X86 Call Frame Optimization", false, false)
// This checks whether the transformation is legal.
// Also returns false in cases where it's potentially legal, but
// we don't even want to try.
bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
  if (NoX86CFOpt.getValue())
    return false;

  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
  // in the compact unwind encoding that Darwin uses. So, bail if there
  // is a danger of that being generated.
  if (STI->isTargetDarwin() &&
      (!MF.getLandingPads().empty() ||
       (MF.getFunction().needsUnwindTableEntry() && !TFL->hasFP(MF))))
    return false;

  // It is not valid to change the stack pointer outside the prolog/epilog
  // on 64-bit Windows.
  if (STI->isTargetWin64())
    return false;
  // You would expect straight-line code between call-frame setup and
  // call-frame destroy. You would be wrong. There are circumstances (e.g.
  // CMOV_GR8 expansion of a select that feeds a function call!) where we can
  // end up with the setup and the destroy in different basic blocks.
  // This is bad, and breaks SP adjustment.
  // So, check that all of the frames in the function are closed inside
  // the same block, and, for good measure, that there are no nested frames.
  //
  // If any call allocates more argument stack memory than the stack
  // probe size, don't do this optimization. Otherwise, this pass
  // would need to synthesize additional stack probe calls to allocate
  // memory for arguments.
  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
  unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
  bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
  unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
  for (MachineBasicBlock &BB : MF) {
    bool InsideFrameSequence = false;
    for (MachineInstr &MI : BB) {
      if (MI.getOpcode() == FrameSetupOpcode) {
        if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
          return false;
        if (InsideFrameSequence)
          return false;
        InsideFrameSequence = true;
      } else if (MI.getOpcode() == FrameDestroyOpcode) {
        if (!InsideFrameSequence)
          return false;
        InsideFrameSequence = false;
      }
    }

    if (InsideFrameSequence)
      return false;
  }

  return true;
}
// Check whether this transformation is profitable for a particular
// function - in terms of code size.
bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
                                            ContextVector &CallSeqVector) {
  // This transformation is always a win when we do not expect to have
  // a reserved call frame. Under other circumstances, it may be either
  // a win or a loss, and requires a heuristic.
  bool CannotReserveFrame = MF.getFrameInfo().hasVarSizedObjects();
  if (CannotReserveFrame)
    return true;

  Align StackAlign = TFL->getStackAlign();

  int64_t Advantage = 0;
  for (const auto &CC : CallSeqVector) {
    // Call sites where no parameters are passed on the stack
    // do not affect the cost, since there needs to be no
    // stack adjustment.
    if (CC.NoStackParams)
      continue;

    if (!CC.UsePush) {
      // If we don't use pushes for a particular call site, we pay for not
      // having a reserved call frame with an additional sub/add esp pair.
      // The cost is ~3 bytes per instruction, depending on the size of the
      // constant.
      // TODO: Callee-pop functions should have a smaller penalty, because
      // an add is needed even with a reserved call frame.
      Advantage -= 6;
    } else {
      // We can use pushes. First, account for the fixed costs.
      // We'll need an add after the call.
      Advantage -= 3;
      // If we have to realign the stack, we'll also need a sub before the call.
      if (!isAligned(StackAlign, CC.ExpectedDist))
        Advantage -= 3;
      // Now, for each push, we save ~3 bytes. For small constants, we actually
      // save more (up to 5 bytes), but 3 should be a good approximation.
      Advantage += (CC.ExpectedDist >> Log2SlotSize) * 3;
    }
  }

  return Advantage >= 0;
}
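
// For example, under the costs assumed above (illustrative arithmetic, not
// from the source): a call passing four slot-sized arguments that needs no
// realignment scores -3 + 4*3 = +9, so pushes win; a single-argument call
// that also needs realignment scores -3 - 3 + 1*3 = -3, a net loss.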
bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
  STI = &MF.getSubtarget<X86Subtarget>();
  TII = STI->getInstrInfo();
  TFL = STI->getFrameLowering();
  MRI = &MF.getRegInfo();

  const X86RegisterInfo &RegInfo =
      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
  SlotSize = RegInfo.getSlotSize();
  assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
  Log2SlotSize = Log2_32(SlotSize);

  if (skipFunction(MF.getFunction()) || !isLegal(MF))
    return false;

  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
  bool Changed = false;

  ContextVector CallSeqVector;

  for (auto &MBB : MF)
    for (auto &MI : MBB)
      if (MI.getOpcode() == FrameSetupOpcode) {
        CallContext Context;
        collectCallInfo(MF, MBB, MI, Context);
        CallSeqVector.push_back(Context);
      }

  if (!isProfitable(MF, CallSeqVector))
    return false;

  for (const auto &CC : CallSeqVector) {
    if (CC.UsePush) {
      adjustCallSequence(MF, CC);
      Changed = true;
    }
  }

  return Changed;
}
X86CallFrameOptimization::InstClassification
X86CallFrameOptimization::classifyInstruction(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
    const X86RegisterInfo &RegInfo, DenseSet<unsigned int> &UsedRegs) {
  if (MI == MBB.end())
    return Exit;

  // The instructions we actually care about are movs onto the stack or special
  // cases of constant-stores to stack.
  switch (MI->getOpcode()) {
  case X86::AND16mi:
  case X86::AND32mi:
  case X86::AND64mi32: {
    // An AND with 0 is a compact way of storing 0.
    const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
    return ImmOp.getImm() == 0 ? Convert : Exit;
  }
  case X86::OR16mi:
  case X86::OR32mi:
  case X86::OR64mi32: {
    // An OR with -1 is a compact way of storing -1.
    const MachineOperand &ImmOp = MI->getOperand(X86::AddrNumOperands);
    return ImmOp.getImm() == -1 ? Convert : Exit;
  }
  case X86::MOV32mi:
  case X86::MOV32mr:
  case X86::MOV64mi32:
  case X86::MOV64mr:
    return Convert;
  }
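
  // For reference (illustrative, not from a real test case): ISel may emit
  //   andl $0, (%esp)
  //   orl $-1, 4(%esp)
  // to store a 0 and a -1 argument, since these encode more compactly than
  // the equivalent movs; both convert to pushes of the immediate.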
  // Not all calling conventions have only stack MOVs between the stack
  // adjust and the call.
  // We want to tolerate other instructions, to cover more cases.
  // In particular:
  // a) PCrel calls, where we expect an additional COPY of the basereg.
  // b) Passing frame-index addresses.
  // c) Calling conventions that have inreg parameters. These generate
  //    both copies and movs into registers.
  // To avoid creating lots of special cases, allow any instruction
  // that does not write into memory, does not def or use the stack
  // pointer, and does not def any register that was used by a preceding
  // push.
  // (Reading from memory is allowed, even if referenced through a
  // frame index, since these will get adjusted properly in PEI)

  // The reason for the last condition is that the pushes can't replace
  // the movs in place, because the order must be reversed.
  // So if we have a MOV32mr that uses EDX, then an instruction that defs
  // EDX, and then the call, after the transformation the push will use
  // the modified version of EDX, and not the original one.
  // Since we are still in SSA form at this point, we only need to
  // make sure we don't clobber any *physical* registers that were
  // used by an earlier mov that will become a push.

  if (MI->isCall() || MI->mayStore())
    return Exit;

  for (const MachineOperand &MO : MI->operands()) {
    if (!MO.isReg())
      continue;
    Register Reg = MO.getReg();
    if (!Reg.isPhysical())
      continue;
    if (RegInfo.regsOverlap(Reg, RegInfo.getStackRegister()))
      return Exit;
    if (MO.isDef()) {
      for (unsigned int U : UsedRegs)
        if (RegInfo.regsOverlap(Reg, U))
          return Exit;
    }
  }

  return Skip;
}
void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
                                               MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator I,
                                               CallContext &Context) {
  // Check that this particular call sequence is amenable to the
  // transformation.
  const X86RegisterInfo &RegInfo =
      *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());

  // We expect to enter this at the beginning of a call sequence
  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
  MachineBasicBlock::iterator FrameSetup = I++;
  Context.FrameSetup = FrameSetup;

  // How much do we adjust the stack? This puts an upper bound on
  // the number of parameters actually passed on it.
  unsigned int MaxAdjust = TII->getFrameSize(*FrameSetup) >> Log2SlotSize;

  // A zero adjustment means no stack parameters
  if (!MaxAdjust) {
    Context.NoStackParams = true;
    return;
  }
  // Skip over DEBUG_VALUE.
  // For globals in PIC mode, we can have some LEAs here. Skip them as well.
  // TODO: Extend this to something that covers more cases.
  while (I->getOpcode() == X86::LEA32r || I->isDebugInstr())
    ++I;

  Register StackPtr = RegInfo.getStackRegister();
  auto StackPtrCopyInst = MBB.end();
  // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
  // register. If it's there, use that virtual register as stack pointer
  // instead. Also, we need to locate this instruction so that we can later
  // safely ignore it while doing the conservative processing of the call
  // chain. The COPY can be located anywhere between the call-frame setup
  // instruction and its first use. We use the call instruction as a boundary
  // because it is usually cheaper to check if an instruction is a call than
  // checking if an instruction uses a register.
  for (auto J = I; !J->isCall(); ++J)
    if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
        J->getOperand(1).getReg() == StackPtr) {
      StackPtrCopyInst = J;
      Context.SPCopy = &*J++;
      StackPtr = Context.SPCopy->getOperand(0).getReg();
      break;
    }
  // Scan the call setup sequence for the pattern we're looking for.
  // We only handle a simple case - a sequence of store instructions that
  // push a sequence of stack-slot-aligned values onto the stack, with
  // no gaps between them.
  if (MaxAdjust > 4)
    Context.ArgStoreVector.resize(MaxAdjust, nullptr);
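
  // The shape we are matching looks like this (32-bit sketch; registers and
  // offsets are illustrative):
  //   ADJCALLSTACKDOWN32 8, ...
  //   movl %eax, (%esp)
  //   movl $0, 4(%esp)
  //   calll foo
  //   ADJCALLSTACKUP32 8, ...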
  DenseSet<unsigned int> UsedRegs;

  for (InstClassification Classification = Skip; Classification != Exit; ++I) {
    // If this is the COPY of the stack pointer, it's ok to ignore.
    if (I == StackPtrCopyInst)
      continue;

    Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs);

    if (Classification != Convert)
      continue;

    // We know the instruction has a supported store opcode.
    // We only want movs of the form:
    // mov imm/reg, k(%StackPtr)
    // If we run into something else, bail.
    // Note that AddrBaseReg may, counter to its name, not be a register,
    // but rather a frame index.
    // TODO: Support the fi case. This should probably work now that we
    // have the infrastructure to track the stack pointer within a call
    // sequence.
    if (!I->getOperand(X86::AddrBaseReg).isReg() ||
        (I->getOperand(X86::AddrBaseReg).getReg() != StackPtr) ||
        !I->getOperand(X86::AddrScaleAmt).isImm() ||
        (I->getOperand(X86::AddrScaleAmt).getImm() != 1) ||
        (I->getOperand(X86::AddrIndexReg).getReg() != X86::NoRegister) ||
        (I->getOperand(X86::AddrSegmentReg).getReg() != X86::NoRegister))
      return;

    int64_t StackDisp = I->getOperand(X86::AddrDisp).getImm();
    assert(StackDisp >= 0 &&
           "Negative stack displacement when passing parameters");

    // We really don't want to consider the unaligned case.
    if (StackDisp & (SlotSize - 1))
      return;
    StackDisp >>= Log2SlotSize;

    assert((size_t)StackDisp < Context.ArgStoreVector.size() &&
           "Function call has more parameters than the stack is adjusted for.");

    // If the same stack slot is being filled twice, something's fishy.
    if (Context.ArgStoreVector[StackDisp] != nullptr)
      return;
    Context.ArgStoreVector[StackDisp] = &*I;

    // Remember which physical registers the store reads, so that a later
    // instruction that defs one of them makes classifyInstruction bail out.
    for (const MachineOperand &MO : I->uses()) {
      if (!MO.isReg())
        continue;
      Register Reg = MO.getReg();
      if (Reg.isPhysical())
        UsedRegs.insert(Reg);
    }
  }
  --I;

  // We now expect the end of the sequence. If we stopped early,
  // or reached the end of the block without finding a call, bail.
  if (I == MBB.end() || !I->isCall())
    return;

  Context.Call = &*I;
  if ((++I)->getOpcode() != TII->getCallFrameDestroyOpcode())
    return;

  // Now, go through the vector, and see that we don't have any gaps,
  // but only a series of storing instructions.
  auto MMI = Context.ArgStoreVector.begin(), MME = Context.ArgStoreVector.end();
  for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
    if (*MMI == nullptr)
      break;

  // If the call had no parameters, do nothing
  if (MMI == Context.ArgStoreVector.begin())
    return;

  // We are either at the last parameter, or a gap.
  // Make sure it's not a gap.
  for (; MMI != MME; ++MMI)
    if (*MMI != nullptr)
      return;

  Context.UsePush = true;
}
void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
                                                  const CallContext &Context) {
  // Ok, we can in fact do the transformation for this call.
  // Do not remove the FrameSetup instruction, but adjust the parameters.
  // PEI will end up finalizing the handling of this.
  MachineBasicBlock::iterator FrameSetup = Context.FrameSetup;
  MachineBasicBlock &MBB = *(FrameSetup->getParent());
  TII->setFrameAdjustment(*FrameSetup, Context.ExpectedDist);

  const DebugLoc &DL = FrameSetup->getDebugLoc();
  bool Is64Bit = STI->is64Bit();

  // Now, iterate through the vector in reverse order, and replace the store
  // to stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
  // replace uses.
  for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
    MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
    const MachineOperand &PushOp = Store->getOperand(X86::AddrNumOperands);
    MachineBasicBlock::iterator Push = nullptr;
    unsigned PushOpcode;
    switch (Store->getOpcode()) {
    default:
      llvm_unreachable("Unexpected Opcode!");
    case X86::AND16mi:
    case X86::AND32mi:
    case X86::AND64mi32:
    case X86::OR16mi:
    case X86::OR32mi:
    case X86::OR64mi32:
    case X86::MOV32mi:
    case X86::MOV64mi32:
      PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSH32i;
      Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
      Push->cloneMemRefs(MF, *Store);
      break;
    case X86::MOV32mr:
    case X86::MOV64mr: {
      Register Reg = PushOp.getReg();

      // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
      // in preparation for the PUSH64. The upper 32 bits can be undef.
      if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
        Register UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
        Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
        BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
        BuildMI(MBB, Context.Call, DL, TII->get(X86::INSERT_SUBREG), Reg)
            .addReg(UndefReg)
            .add(PushOp)
            .addImm(X86::sub_32bit);
      }
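
      // In MIR terms, the widening above looks roughly like (sketch):
      //   %undef:gr64 = IMPLICIT_DEF
      //   %wide:gr64 = INSERT_SUBREG %undef, %orig:gr32, %subreg.sub_32bit
      // so the later PUSH64r reads a 64-bit register whose high half is undef.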
      // If PUSHrmm is not slow on this target, try to fold the source of the
      // push into the instruction.
      bool SlowPUSHrmm = STI->slowTwoMemOps();

      // Check that this is legal to fold. Right now, we're extremely
      // conservative about that.
      MachineInstr *DefMov = nullptr;
      if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
        PushOpcode = Is64Bit ? X86::PUSH64rmm : X86::PUSH32rmm;
        MachineInstrBuilder PushMIB =
            BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode));

        unsigned NumOps = DefMov->getDesc().getNumOperands();
        for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
          PushMIB.add(DefMov->getOperand(i));
        Push = PushMIB.getInstr();
        Push->cloneMergedMemRefs(MF, {DefMov, &*Store});
        DefMov->eraseFromParent();
      } else {
        PushOpcode = Is64Bit ? X86::PUSH64r : X86::PUSH32r;
        Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
                   .addReg(Reg)
                   .getInstr();
        Push->cloneMemRefs(MF, *Store);
      }
      break;
    }
    }
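
    // When the fold applies, a load/store pair such as
    //   movl 8(%edi), %eax
    //   movl %eax, (%esp)
    // collapses into a single memory-operand push (illustrative):
    //   pushl 8(%edi)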
    // For debugging, when using SP-based CFA, we need to adjust the CFA
    // offset after each push.
    // TODO: This is needed only if we require precise CFA.
    if (!TFL->hasFP(MF))
      TFL->BuildCFI(
          MBB, std::next(Push), DL,
          MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));

    MBB.erase(Store);
  }

  // The stack-pointer copy is no longer used in the call sequences.
  // There should not be any other users, but we can't commit to that, so:
  if (Context.SPCopy &&
      MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
    Context.SPCopy->eraseFromParent();

  // Once we've done this, we need to make sure PEI doesn't assume a reserved
  // frame.
  MF.getInfo<X86MachineFunctionInfo>()->setHasPushSequences(true);
}

MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
    MachineBasicBlock::iterator FrameSetup, Register Reg) {
  // Do an extremely restricted form of load folding.
  // ISel will often create patterns like:
  // movl    4(%edi), %eax
  // movl    8(%edi), %ecx
  // movl    12(%edi), %edx
  // movl    %eax, (%esp)
  // movl    %ecx, 4(%esp)
  // movl    %edx, 8(%esp)
  // Get rid of those with prejudice.
  if (!Reg.isVirtual())
    return nullptr;

  // Make sure this is the only use of Reg.
  if (!MRI->hasOneNonDBGUse(Reg))
    return nullptr;

  MachineInstr &DefMI = *MRI->getVRegDef(Reg);
  // Make sure the def is a MOV from memory.
  // If the def is in another block, give up.
  if ((DefMI.getOpcode() != X86::MOV32rm &&
       DefMI.getOpcode() != X86::MOV64rm) ||
      DefMI.getParent() != FrameSetup->getParent())
    return nullptr;

  // Make sure we don't have any instructions between DefMI and the
  // push that make folding the load illegal.
  for (MachineBasicBlock::iterator I = DefMI; I != FrameSetup; ++I)
    if (I->isLoadFoldBarrier())
      return nullptr;

  return &DefMI;
}

FunctionPass *llvm::createX86CallFrameOptimization() {
  return new X86CallFrameOptimization();
}