1//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 9// This file implements a Loop Data Prefetching Pass. 11//===----------------------------------------------------------------------===// 33#define DEBUG_TYPE "loop-data-prefetch" 37// By default, we limit this to creating 16 PHIs (which is a little over half 38// of the allocatable register set). 41cl::desc(
"Prefetch write addresses"));
45cl::desc(
"Number of instructions to prefetch ahead"),
53"max-prefetch-iters-ahead",
56STATISTIC(NumPrefetches,
"Number of prefetches inserted");
60/// Loop prefetch implementation class. 61classLoopDataPrefetch {
66 : AC(AC), DT(DT), LI(LI), SE(SE),
TTI(
TTI), ORE(ORE) {}
71bool runOnLoop(
Loop *L);
73 /// Check if the stride of the accesses is large enough to 74 /// warrant a prefetch. 75bool isStrideLargeEnough(
constSCEVAddRecExpr *AR,
unsigned TargetMinStride);
77unsigned getMinPrefetchStride(
unsigned NumMemAccesses,
78unsigned NumStridedMemAccesses,
79unsigned NumPrefetches,
84 NumPrefetches, HasCall);
87unsigned getPrefetchDistance() {
93unsigned getMaxPrefetchIterationsAhead() {
99bool doPrefetchWrites() {
113/// Legacy class for inserting loop data prefetches. 116staticcharID;
// Pass ID, replacement for typeid 139char LoopDataPrefetchLegacyPass::ID = 0;
141"Loop Data Prefetch",
false,
false)
152returnnew LoopDataPrefetchLegacyPass();
155bool LoopDataPrefetch::isStrideLargeEnough(
constSCEVAddRecExpr *AR,
156unsigned TargetMinStride) {
157// No need to check if any stride goes. 158if (TargetMinStride <= 1)
162// If MinStride is set, don't prefetch unless we can ensure that stride is 167unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
168return TargetMinStride <= AbsStride;
181 LoopDataPrefetch LDP(AC, DT, LI, SE,
TTI, ORE);
182bool Changed = LDP.run();
194bool LoopDataPrefetchLegacyPass::runOnFunction(
Function &
F) {
198DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
199LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
200ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
202 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
204 &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
206 &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
F);
208 LoopDataPrefetch LDP(AC, DT, LI, SE,
TTI, ORE);
212bool LoopDataPrefetch::run() {
213// If PrefetchDistance is not set, don't run the pass. This gives an 214// opportunity for targets to run this pass for selected subtargets only 215// (whose TTI sets PrefetchDistance and CacheLineSize). 217LLVM_DEBUG(
dbgs() <<
"Please set both PrefetchDistance and CacheLineSize " 218"for loop data prefetch.\n");
222bool MadeChange =
false;
226 MadeChange |= runOnLoop(L);
231/// A record for a potential prefetch made during the initial scan of the 232/// loop. This is used to let a single prefetch target multiple memory accesses. 234 /// The address formula for this prefetch as returned by ScalarEvolution. 236 /// The point of insertion for the prefetch instruction. 238 /// True if targeting a write memory access. 240 /// The (first seen) prefetched instruction. 243 /// Constructor to create a new Prefetch for \p I. 248 /// Add the instruction \param I to this prefetch. If it's not the first 249 /// one, 'InsertPt' and 'Writes' will be updated as required. 250 /// \param PtrDiff the known constant address difference to the first added 253 int64_t PtrDiff = 0) {
261if (PrefBB != InsBB) {
267if (isa<StoreInst>(
I) && PtrDiff == 0)
273bool LoopDataPrefetch::runOnLoop(
Loop *L) {
274bool MadeChange =
false;
276// Only prefetch in the inner-most loop 277if (!
L->isInnermost())
283// Calculate the number of iterations ahead to prefetch 286for (
constauto BB :
L->blocks()) {
287// If the loop already has prefetches, then assume that the user knows 288// what they are doing and don't add any more. 290if (isa<CallInst>(&
I) || isa<InvokeInst>(&
I)) {
292if (
F->getIntrinsicID() == Intrinsic::prefetch)
296 }
else {
// indirect call. 301Metrics.analyzeBasicBlock(BB, *
TTI, EphValues);
304if (!
Metrics.NumInsts.isValid())
307unsigned LoopSize = *
Metrics.NumInsts.getValue();
311unsigned ItersAhead = getPrefetchDistance() / LoopSize;
315if (ItersAhead > getMaxPrefetchIterationsAhead())
319if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
322unsigned NumMemAccesses = 0;
323unsigned NumStridedMemAccesses = 0;
325for (
constauto BB :
L->blocks())
330if (
LoadInst *LMemI = dyn_cast<LoadInst>(&
I)) {
332 PtrValue = LMemI->getPointerOperand();
333 }
elseif (
StoreInst *SMemI = dyn_cast<StoreInst>(&
I)) {
334if (!doPrefetchWrites())
continue;
336 PtrValue = SMemI->getPointerOperand();
343if (
L->isLoopInvariant(PtrValue))
347constSCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
350 NumStridedMemAccesses++;
352// We don't want to double prefetch individual cache lines. If this 353// access is known to be within one cache line of some other one that 354// has already been prefetched, then don't prefetch this one as well. 356for (
auto &Pref : Prefetches) {
359 dyn_cast<SCEVConstant>(PtrDiff)) {
360 int64_t
PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
362 Pref.addInstruction(MemI, DT, PD);
369 Prefetches.push_back(
Prefetch(LSCEVAddRec, MemI));
372unsigned TargetMinStride =
373 getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
374 Prefetches.size(), HasCall);
377 <<
" iterations ahead (loop size: " << LoopSize <<
") in " 378 <<
L->getHeader()->getParent()->getName() <<
": " << *L);
380 << NumMemAccesses <<
" memory accesses, " 381 << NumStridedMemAccesses <<
" strided memory accesses, " 382 << Prefetches.size() <<
" potential prefetch(es), " 383 <<
"a minimum stride of " << TargetMinStride <<
", " 384 << (HasCall ?
"calls" :
"no calls") <<
".\n");
386for (
auto &
P : Prefetches) {
387// Check if the stride of the accesses is large enough to warrant a 389if (!isStrideLargeEnough(
P.LSCEVAddRec, TargetMinStride))
396P.LSCEVAddRec->getStepRecurrence(*SE)));
397if (!SCEVE.isSafeToExpand(NextLSCEV))
402Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr,
P.InsertPt);
406 Builder.CreateIntrinsic(Intrinsic::prefetch, PrefPtrValue->
getType(),
407 {PrefPtrValue, ConstantInt::get(I32, P.Writes),
408 ConstantInt::get(I32, 3),
409 ConstantInt::get(I32, 1)});
412 << *
P.MemI->getOperand(isa<LoadInst>(
P.MemI) ? 0 : 1)
413 <<
", SCEV: " << *
P.LSCEVAddRec <<
"\n");
416 <<
"prefetched memory access";
This file builds on the ADT/GraphTraits.h file to build generic depth first graph iterator.
SmallVector< uint32_t, 0 > Writes
static cl::opt< bool > PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false), cl::desc("Prefetch write addresses"))
static cl::opt< unsigned > MinPrefetchStride("min-prefetch-stride", cl::desc("Min stride to add prefetches"), cl::Hidden)
static cl::opt< unsigned > PrefetchDistance("prefetch-distance", cl::desc("Number of instructions to prefetch ahead"), cl::Hidden)
static cl::opt< unsigned > MaxPrefetchIterationsAhead("max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden)
This file provides the interface for LLVM's Loop Data Prefetching Pass.
static const Function * getCalledFunction(const Value *V)
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This pass exposes codegen information to IR-level passes.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequiredID(const void *ID)
AnalysisUsage & addPreservedID(const void *ID)
AnalysisUsage & addRequired()
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Analysis pass which computes a DominatorTree.
Legacy analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
FunctionPass class - This class is used to implement most global optimizations.
virtual bool runOnFunction(Function &F)=0
runOnFunction - Virtual method overriden by subclasses to do the per-function processing of the pass.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An instruction for reading from memory.
Analysis pass that exposes the LoopInfo for a function.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Run the pass over the function.
The legacy pass manager's analysis pass to compute loop information.
Represents a single loop in the control flow graph.
OptimizationRemarkEmitter legacy analysis pass.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserve()
Mark an analysis as preserved.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
This class represents a constant integer value.
This class uses information about analyze scalars to rewrite expressions in canonical form.
This class represents an analyzed expression in the program.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Analysis pass providing the TargetTransformInfo.
Wrapper pass for TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
unsigned getPrefetchDistance() const
bool enableWritePrefetching() const
unsigned getMaxPrefetchIterationsAhead() const
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Some HW prefetchers can handle accesses up to a certain constant stride.
bool shouldPrefetchAddressSpace(unsigned AS) const
unsigned getCacheLineSize() const
bool isLoweredToCall(const Function *F) const
Test whether calls to a function lower to actual program function calls.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getInt32Ty(LLVMContext &C)
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
int getNumOccurrences() const
const ParentTy * getParent() const
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ PD
PD - Prefix code for packed double precision vector floating point operations performed in the SSE re...
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &)
FunctionPass * createLoopDataPrefetchPass()
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
iterator_range< df_iterator< T > > depth_first(const T &G)
A record for a potential prefetch made during the initial scan of the loop.
void addInstruction(Instruction *I, DominatorTree *DT=nullptr, int64_t PtrDiff=0)
Add the instruction.
const SCEVAddRecExpr * LSCEVAddRec
The address formula for this prefetch as returned by ScalarEvolution.
Prefetch(const SCEVAddRecExpr *L, Instruction *I)
Constructor to create a new Prefetch for I.
Utility to calculate the size and a few similar metrics for a set of basic blocks.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).