1//===- LoopVectorizationLegality.cpp --------------------------------------===// 3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4// See https://llvm.org/LICENSE.txt for license information. 5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7//===----------------------------------------------------------------------===// 9// This file provides loop vectorization legality analysis. Original code 10// resided in LoopVectorize.cpp for a long time. 12// At this point, it is implemented as a utility class, not as an analysis 13// pass. It should be easy to create an analysis pass around it if there 14// is a need (but D45420 needs to happen first). 32using namespacePatternMatch;
34#define LV_NAME "loop-vectorize" 35#define DEBUG_TYPE LV_NAME 39cl::desc(
"Enable if-conversion during vectorization."));
43cl::desc(
"Enable recognition of non-constant strided " 44"pointer induction variables."));
49cl::desc(
"Allow enabling loop hints to reorder " 50"FP operations during vectorization."));
53// TODO: Move size-based thresholds out of legality checking, make cost based 54// decisions instead of hard thresholds. 57cl::desc(
"The maximum number of SCEV checks allowed."));
61cl::desc(
"The maximum number of SCEV checks allowed with a " 62"vectorize(enable) pragma"));
68cl::desc(
"Control whether the compiler can use scalable vectors to " 72"Scalable vectorization is disabled."),
75"Scalable vectorization is available and favored when the " 76"cost is inconclusive."),
79"Scalable vectorization is available and favored when the " 80"cost is inconclusive.")));
84cl::desc(
"Enables autovectorization of some loops containing histograms"));
86/// Maximum vectorization interleave count. 91bool LoopVectorizeHints::Hint::validate(
unsigned Val) {
102return (Val == 0 || Val == 1);
108bool InterleaveOnlyWhenForced,
112 Interleave(
"interleave.count", InterleaveOnlyWhenForced, HK_INTERLEAVE),
113 Force(
"vectorize.enable", FK_Undefined, HK_FORCE),
114 IsVectorized(
"isvectorized", 0, HK_ISVECTORIZED),
115 Predicate(
"vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
116 Scalable(
"vectorize.scalable.enable", SK_Unspecified, HK_SCALABLE),
117 TheLoop(L), ORE(ORE) {
118// Populate values with existing loop metadata. 119 getHintsFromMetadata();
121// force-vector-interleave overrides DisableInterleaving. 125// If the metadata doesn't explicitly specify whether to enable scalable 126// vectorization, then decide based on the following criteria (increasing 127// level of priority): 130// - Force option (always overrides) 137// If the width is set, but the metadata says nothing about the scalable 138// property, then assume it concerns only a fixed-width UserVF. 139// If width is not set, the flag takes precedence. 143// If the flag is set to force any use of scalable vectors, override the loop 149// Scalable vectorization is disabled if no preference is specified. 153if (IsVectorized.Value != 1)
154// If the vectorization width and interleaving count are both 1 then 155// consider the loop to have been already vectorized because there's 156// nothing more that we can do. 160 <<
"LV: Interleaving disabled by the pass manager\n");
173 {
Twine(Prefix(),
"vectorize.").
str(),
174Twine(Prefix(),
"interleave.").
str()},
178// Update internal cache. 179 IsVectorized.Value = 1;
185LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: #pragma vectorize disable.\n");
191LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: No #pragma vectorize enable.\n");
197LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Disabled/already vectorized.\n");
198// FIXME: Add interleave.disable metadata. This will allow 199// vectorize.disable to be used without disabling the pass and errors 200// to differentiate between disabled vectorization and a width of 1. 203"AllDisabled", L->getStartLoc(),
205 <<
"loop not vectorized: vectorization and interleaving are " 206"explicitly disabled, or the loop has already been " 223 <<
"loop not vectorized: vectorization is explicitly disabled";
227 R <<
"loop not vectorized";
229 R <<
" (Force=" << NV(
"Force", true);
230 if (Width.Value != 0)
231 R <<
", Vector Width=" << NV(
"VectorWidth", getWidth());
232 if (getInterleave() != 0)
233 R <<
", Interleave Count=" << NV(
"InterleaveCount", getInterleave());
251// Allow the vectorizer to change the order of operations if enabling 252// loop hints are provided 256 EC.getKnownMinValue() > 1);
259void LoopVectorizeHints::getHintsFromMetadata() {
264// First operand should refer to the loop id itself. 272// The expected hint is either a MDString or a MDNode with the first 273// operand a MDString. 274if (
constMDNode *MD = dyn_cast<MDNode>(MDO)) {
275if (!MD || MD->getNumOperands() == 0)
277 S = dyn_cast<MDString>(MD->getOperand(0));
278for (
unsignedIdx = 1;
Idx < MD->getNumOperands(); ++
Idx)
279 Args.push_back(MD->getOperand(
Idx));
281 S = dyn_cast<MDString>(MDO);
282assert(Args.size() == 0 &&
"too many arguments for MDString");
288// Check if the hint starts with the loop metadata prefix. 291 setHint(
Name, Args[0]);
296if (!
Name.starts_with(Prefix()))
300constConstantInt *
C = mdconst::dyn_extract<ConstantInt>(Arg);
303unsigned Val =
C->getZExtValue();
305 Hint *Hints[] = {&Width, &Interleave, &Force,
307for (
auto *
H : Hints) {
318// Return true if the inner loop \p Lp is uniform with regard to the outer loop 319// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes 320// executing the inner loop will execute the same iterations). This check is 321// very constrained for now but it will be relaxed in the future. \p Lp is 322// considered uniform if it meets all the following conditions: 323// 1) it has a canonical IV (starting from 0 and with stride 1), 324// 2) its latch terminator is a conditional branch and, 325// 3) its latch condition is a compare instruction whose operands are the 326// canonical IV and an OuterLp invariant. 327// This check doesn't take into account the uniformity of other conditions not 328// related to the loop latch because they don't affect the loop uniformity. 330// NOTE: We decided to keep all these checks and its associated documentation 331// together so that we can easily have a picture of the current supported loop 332// nests. However, some of the current checks don't depend on \p OuterLp and 333// would be redundantly executed for each \p Lp if we invoked this function for 334// different candidate outer loops. This is not the case for now because we 335// don't currently have the infrastructure to evaluate multiple candidate outer 336// loops and \p OuterLp will be a fixed parameter while we only support explicit 337// outer loop vectorization. It's also very likely that these checks go away 338// before introducing the aforementioned infrastructure. However, if this is not 339// the case, we should move the \p OuterLp independent checks to a separate 340// function that is only executed once for each \p Lp. 344// If Lp is the outer loop, it's uniform by definition. 359if (!LatchBr || LatchBr->isUnconditional()) {
365auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
368dbgs() <<
"LV: Loop latch condition is not a compare instruction.\n");
372Value *CondOp0 = LatchCmp->getOperand(0);
373Value *CondOp1 = LatchCmp->getOperand(1);
374Value *IVUpdate =
IV->getIncomingValueForBlock(Latch);
384// Return true if \p Lp and all its nested loops are uniform with regard to \p 390// Check if nested loops are uniform. 391for (
Loop *SubLp : *Lp)
400returnDL.getIntPtrType(Ty);
402// It is possible that char's or short's overflow when we ask for the loop's 403// trip count, work around this by changing the type size. 418/// Check that the instruction has outside loop users and is not an 419/// identified reduction variable. 422// Reductions, Inductions and non-header phis are allowed to have exit users. All 423// other instructions must not have external users. 424if (!AllowedExit.
count(Inst))
425// Check that all of the users of the loop are inside the BB. 428// This user may be a reduction exit value. 430LLVM_DEBUG(
dbgs() <<
"LV: Found an outside user for : " << *UI <<
'\n');
437/// Returns true if A and B have same pointer operands or same SCEVs addresses 444// Otherwise Compare pointers 445Value *APtr =
A->getPointerOperand();
446Value *BPtr =
B->getPointerOperand();
450// Otherwise compare address SCEVs 456// FIXME: Currently, the set of symbolic strides is sometimes queried before 457// it's collected. This happens from canVectorizeWithIfConvert, when the 458// pointer is checked to reference consecutive elements suitable for a 466 CanAddPredicate,
false).value_or(0);
467if (Stride == 1 || Stride == -1)
477/// A rewriter to build the SCEVs for each of the VF lanes in the expected 478/// vectorized loop, which can then be compared to detect their uniformity. This 479/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop) 480/// with new AddRecs where the step is multiplied by StepMultiplier and Offset * 481/// Step is added. Also checks if all sub-expressions are analyzable w.r.t. 483classSCEVAddRecForUniformityRewriter
485 /// Multiplier to be applied to the step of AddRecs in TheLoop. 486unsigned StepMultiplier;
488 /// Offset to be added to the AddRecs in TheLoop. 491 /// Loop for which to rewrite AddRecsFor. 494 /// Is any sub-expressions not analyzable w.r.t. uniformity? 495bool CannotAnalyze =
false;
497bool canAnalyze()
const{
return !CannotAnalyze; }
500 SCEVAddRecForUniformityRewriter(
ScalarEvolution &SE,
unsigned StepMultiplier,
501unsigned Offset,
Loop *TheLoop)
507"addrec outside of TheLoop must be invariant and should have been " 509// Build a new AddRec by multiplying the step by StepMultiplier and 510// incrementing the start by Offset * step. 513if (!SE.isLoopInvariant(Step, TheLoop)) {
518 SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
519constSCEV *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
520constSCEV *NewStart = SE.getAddExpr(Expr->
getStart(), ScaledOffset);
525if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
531if (SE.isLoopInvariant(S, TheLoop))
533// The value could vary across iterations. 539// Could not analyze the expression. 545unsigned StepMultiplier,
unsigned Offset,
547 /// Bail out if the expression does not contain an UDiv expression. 548 /// Uniform values which are not loop invariant require operations to strip 549 /// out the lowest bits. For now just look for UDivs and use it to avoid 550 /// re-writing UDIV-free expressions for other lanes to limit compile time. 552 [](
constSCEV *S) {
return isa<SCEVUDivExpr>(S); }))
555 SCEVAddRecForUniformityRewriter
Rewriter(SE, StepMultiplier, Offset,
575// Since we rely on SCEV for uniformity, if the type is not SCEVable, it is 576// never considered uniform. 577auto *SE = PSE.
getSE();
582// Rewrite AddRecs in TheLoop to step by VF and check if the expression for 583// lane 0 matches the expressions for all other lanes. 585constSCEV *FirstLaneExpr =
586 SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
587if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
590// Make sure the expressions for lanes FixedVF-1..1 match the expression for 591// lane 0. We check lanes in reverse order for compile-time, as frequently 592// checking the last lane is sufficient to rule out uniformity. 594constSCEV *IthLaneExpr =
595 SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF,
I, TheLoop);
596return FirstLaneExpr == IthLaneExpr;
605// Note: There's nothing inherent which prevents predicated loads and 606// stores from being uniform. The current lowering simply doesn't handle 607// it; in particular, the cost model distinguishes scatter/gather from 608// scalar w/predication, and we currently rely on the scalar path. 612bool LoopVectorizationLegality::canVectorizeOuterLoop() {
614// Store the result and return it at the end instead of exiting early, in case 615// allowExtraAnalysis is used to report multiple reasons for not vectorizing. 620// Check whether the BB terminator is a BranchInst. Any other terminator is 622auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
625"loop control flow is not understood by vectorizer",
626"CFGNotUnderstood", ORE, TheLoop);
633// Check whether the BranchInst is a supported one. Only unconditional 634// branches, conditional branches with an outer loop invariant condition or 635// backedges are supported. 636// FIXME: We skip these checks when VPlan predication is enabled as we 637// want to allow divergent branches. This whole check will be removed 638// once VPlan predication is on by default. 639if (Br && Br->isConditional() &&
644"loop control flow is not understood by vectorizer",
645"CFGNotUnderstood", ORE, TheLoop);
653// Check whether inner loops are uniform. At this point, we only support 654// simple outer loops scenarios with uniform nested loops. 656 TheLoop
/*context outer loop*/)) {
658"loop control flow is not understood by vectorizer",
659"CFGNotUnderstood", ORE, TheLoop);
666// Check whether we are able to set up outer loop induction. 667if (!setupOuterLoopInductions()) {
669"UnsupportedPhi", ORE, TheLoop);
679void LoopVectorizationLegality::addInductionPhi(
684// In case this induction also comes with casts that we know we can ignore 685// in the vectorized loop body, record them here. All casts could be recorded 686// here for ignoring, but suffices to record only the first (as it is the 687// only one that may bw used outside the cast sequence). 695// Get the widest type. 703// Int inductions are special because we only allow one IV. 705ID.getConstIntStepValue() &&
ID.getConstIntStepValue()->isOne() &&
706 isa<Constant>(
ID.getStartValue()) &&
707 cast<Constant>(
ID.getStartValue())->isNullValue()) {
709// Use the phi node with the widest type as induction. Use the last 710// one if there are multiple (no good reason for doing this other 711// than it is expedient). We've checked that it begins at zero and 712// steps by one, so this is a canonical induction variable. 713if (!PrimaryInduction || PhiTy == WidestIndTy)
714 PrimaryInduction =
Phi;
717// Both the PHI node itself, and the "post-increment" value feeding 718// back into the PHI node may have external users. 719// We can allow those uses, except if the SCEVs we have for them rely 720// on predicates that only hold within the loop, since allowing the exit 721// currently means re-using this SCEV outside the loop (see PR33706 for more 731bool LoopVectorizationLegality::setupOuterLoopInductions() {
734// Returns true if a given Phi is a supported induction. 735auto IsSupportedPhi = [&](
PHINode &
Phi) ->
bool {
739 addInductionPhi(&Phi,
ID, AllowedExit);
742// Bail out for any Phi in the outer loop header that is not a supported 745dbgs() <<
"LV: Found unsupported PHI for outer loop vectorization.\n");
752/// Checks if a function is scalarizable according to the TLI, in 753/// the sense that it should be vectorized and then expanded in 754/// multiple scalar calls. This is represented in the 755/// TLI via mappings that do not specify a vector name, as in the 756/// following example: 758/// const VecDesc VecIntrinsics[] = { 759/// {"llvm.phx.abs.i32", "", 4} 764// Check that all known VFs are not associated to a vector 765// function, i.e. the vector name is emty. 768 TLI.
getWidestVF(ScalarName, WidestFixedVF, WidestScalableVF);
776"Caller may decide to scalarize a variant using a scalable VF");
781/// Returns true if the call return type `Ty` can be widened by the loop 784auto *StructTy = dyn_cast<StructType>(Ty);
785// TODO: Remove the homogeneous types restriction. This is just an initial 786// simplification. When we want to support things like the overflow intrinsics 787// we will have to lift this restriction. 788if (StructTy && !StructTy->containsHomogeneousTypes())
793bool LoopVectorizationLegality::canVectorizeInstrs() {
796// For each block in the loop. 798// Scan the instructions in the block and look for hazards. 800if (
auto *Phi = dyn_cast<PHINode>(&
I)) {
801Type *PhiTy = Phi->getType();
802// Check that this PHI type is allowed. 806"loop control flow is not understood by vectorizer",
807"CFGNotUnderstood", ORE, TheLoop);
811// If this PHINode is not in the header block, then we know that we 812// can convert it to select during if-conversion. No need to check if 813// the PHIs in this block are induction or reduction variables. 815// Non-header phi nodes that have outside uses can be vectorized. Add 816// them to the list of allowed exits. 817// Unsafe cyclic dependencies with header phis are identified during 818// legalization for reduction, induction and fixed order 824// We only allow if-converted PHIs with exactly two incoming values. 825if (
Phi->getNumIncomingValues() != 2) {
827"loop control flow is not understood by vectorizer",
828"CFGNotUnderstood", ORE, TheLoop, Phi);
837 Reductions[
Phi] = RedDes;
841// We prevent matching non-constant strided pointer IVS to preserve 842// historical vectorizer behavior after a generalization of the 843// IVDescriptor code. The intent is to remove this check, but we 844// have to fix issues around code quality for such loops first. 845auto IsDisallowedStridedPointerInduction =
850ID.getConstIntStepValue() ==
nullptr;
853// TODO: Instead of recording the AllowedExit, it would be good to 854// record the complementary set: NotAllowedExit. These include (but may 855// not be limited to): 856// 1. Reduction phis as they represent the one-before-last value, which 857// is not available when vectorized 858// 2. Induction phis and increment when SCEV predicates cannot be used 859// outside the loop - see addInductionPhi 860// 3. Non-Phis with outside uses when SCEV predicates cannot be used 861// outside the loop - see call to hasOutsideLoopUser in the non-phi 863// 4. FixedOrderRecurrence phis that can possibly be handled by 865// By recording these, we can then reason about ways to vectorize each 866// of these NotAllowedExit. 869 !IsDisallowedStridedPointerInduction(
ID)) {
870 addInductionPhi(Phi,
ID, AllowedExit);
877 FixedOrderRecurrences.
insert(Phi);
881// As a last resort, coerce the PHI to a AddRec expression 882// and re-try classifying it a an induction PHI. 884 !IsDisallowedStridedPointerInduction(
ID)) {
885 addInductionPhi(Phi,
ID, AllowedExit);
890"value that could not be identified as " 891"reduction is used outside the loop",
892"NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
894 }
// end of PHI handling 896// We handle calls that: 897// * Are debug info intrinsics. 898// * Have a mapping to an IR intrinsic. 899// * Have a vector version available. 900auto *CI = dyn_cast<CallInst>(&
I);
903 !isa<DbgInfoIntrinsic>(CI) &&
904 !(CI->getCalledFunction() && TLI &&
907// If the call is a recognized math libary call, it is likely that 908// we can vectorize it given loosened floating-point constraints. 911 TLI && CI->getCalledFunction() &&
912 CI->getType()->isFloatingPointTy() &&
913 TLI->
getLibFunc(CI->getCalledFunction()->getName(), Func) &&
917// TODO: Ideally, we should not use clang-specific language here, 918// but it's hard to provide meaningful yet generic advice. 919// Also, should this be guarded by allowExtraAnalysis() and/or be part 920// of the returned info from isFunctionVectorizable()? 922"Found a non-intrinsic callsite",
923"library call cannot be vectorized. " 924"Try compiling with -fno-math-errno, -ffast-math, " 926"CantVectorizeLibcall", ORE, TheLoop, CI);
929"call instruction cannot be vectorized",
930"CantVectorizeLibcall", ORE, TheLoop, CI);
935// Some intrinsics have scalar arguments and should be same in order for 936// them to be vectorized (i.e. loop invariant). 938auto *SE = PSE.
getSE();
940for (
unsignedIdx = 0;
Idx < CI->arg_size(); ++
Idx)
945"intrinsic instruction cannot be vectorized",
946"CantVectorizeIntrinsic", ORE, TheLoop, CI);
952// If we found a vectorized variant of a function, note that so LV can 953// make better decisions about maximum VF. 955 VecCallVariantsFound =
true;
957auto CanWidenInstructionTy = [
this](
Instructionconst &Inst) {
958Type *InstTy = Inst.getType();
959if (!isa<StructType>(InstTy))
962// For now, we only recognize struct values returned from calls where 963// all users are extractvalue as vectorizable. All element types of the 964// struct must be types that can be widened. 966all_of(Inst.users(), IsaPred<ExtractValueInst>)) {
967// TODO: Remove the `StructVecCallFound` flag once vectorizing calls 968// with struct returns is supported. 969 StructVecCallFound =
true;
976// Check that the instruction return type is vectorizable. 977// We can't vectorize casts from vector type to scalar type. 978// Also, we can't vectorize extractelement instructions. 979if (!CanWidenInstructionTy(
I) ||
982 isa<ExtractElementInst>(
I)) {
984"instruction return type cannot be vectorized",
985"CantVectorizeInstructionReturnType", ORE, TheLoop, &
I);
989// Check that the stored type is vectorizable. 990if (
auto *ST = dyn_cast<StoreInst>(&
I)) {
991Type *
T =
ST->getValueOperand()->getType();
994"CantVectorizeStore", ORE, TheLoop, ST);
998// For nontemporal stores, check that a nontemporal vector version is 999// supported on the target. 1000if (
ST->getMetadata(LLVMContext::MD_nontemporal)) {
1001// Arbitrarily try a vector of 2 elements. 1003assert(VecTy &&
"did not find vectorized version of stored type");
1006"nontemporal store instruction cannot be vectorized",
1007"CantVectorizeNontemporalStore", ORE, TheLoop, ST);
1012 }
elseif (
auto *LD = dyn_cast<LoadInst>(&
I)) {
1013if (
LD->getMetadata(LLVMContext::MD_nontemporal)) {
1014// For nontemporal loads, check that a nontemporal vector version is 1015// supported on the target (arbitrarily try a vector of 2 elements). 1017assert(VecTy &&
"did not find vectorized version of load type");
1020"nontemporal load instruction cannot be vectorized",
1021"CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
1026// FP instructions can allow unsafe algebra, thus vectorizable by 1027// non-IEEE-754 compliant SIMD units. 1028// This applies to floating-point math operations and calls, not memory 1029// operations, shuffles, or casts, as they don't change precision or 1031 }
elseif (
I.getType()->isFloatingPointTy() && (CI ||
I.isBinaryOp()) &&
1034 Hints->setPotentiallyUnsafe();
1037// Reduction instructions are allowed to have exit users. 1038// All other instructions must not have external users. 1040// We can safely vectorize loops where instructions within the loop are 1041// used outside the loop only if the SCEV predicates within the loop is 1042// same as outside the loop. Allowing the exit means reusing the SCEV 1049"ValueUsedOutsideLoop", ORE, TheLoop, &
I);
1055if (!PrimaryInduction) {
1056if (Inductions.
empty()) {
1058"loop induction variable could not be identified",
1059"NoInductionVariable", ORE, TheLoop);
1064"integer loop induction variable could not be identified",
1065"NoIntegerInductionVariable", ORE, TheLoop);
1068LLVM_DEBUG(
dbgs() <<
"LV: Did not find one integer induction var.\n");
1071// Now we know the widest induction type, check if our found induction 1072// is the same size. If it's not, unset it here and InnerLoopVectorizer 1073// will create another. 1074if (PrimaryInduction && WidestIndTy != PrimaryInduction->
getType())
1075 PrimaryInduction =
nullptr;
1080/// Find histogram operations that match high-level code in loops: 1082/// buckets[indices[i]]+=step; 1085/// It matches a pattern starting from \p HSt, which Stores to the 'buckets' 1086/// array the computed histogram. It uses a BinOp to sum all counts, storing 1087/// them using a loop-variant index Load from the 'indices' input array. 1089/// On successful matches it updates the STATISTIC 'HistogramsDetected', 1090/// regardless of hardware support. When there is support, it additionally 1091/// stores the BinOp/Load pairs in \p HistogramCounts, as well the pointers 1092/// used to update histogram in \p HistogramPtrs. 1097// Store value must come from a Binary Operation. 1103// BinOp must be an Add or a Sub modifying the bucket value by a 1104// loop invariant amount. 1105// FIXME: We assume the loop invariant term is on the RHS. 1106// Fine for an immediate/constant, but maybe not a generic value? 1107Value *HIncVal =
nullptr;
1112// Make sure the increment value is loop invariant. 1116// The address to store is calculated through a GEP Instruction. 1121// Restrict address calculation to constant indices except for the last term. 1122Value *HIdx =
nullptr;
1123for (
Value *Index :
GEP->indices()) {
1126if (!isa<ConstantInt>(Index))
1133// Check that the index is calculated by loading from another array. Ignore 1135// FIXME: Support indices from other sources than a linear load from memory? 1136// We're currently trying to match an operation looping over an array 1137// of indices, but there could be additional levels of indirection 1138// in place, or possibly some additional calculation to form the index 1139// from the loaded data. 1144// Make sure the index address varies in this loop, not an outer loop. 1145constauto *AR = dyn_cast<SCEVAddRecExpr>(PSE.
getSE()->
getSCEV(VPtrVal));
1146if (!AR || AR->getLoop() != TheLoop)
1149// Ensure we'll have the same mask by checking that all parts of the histogram 1150// (gather load, update, scatter store) are in the same block. 1158// Store the operations that make up the histogram. 1163bool LoopVectorizationLegality::canVectorizeIndirectUnsafeDependences() {
1164// For now, we only support an IndirectUnsafe dependency that calculates 1169// Find a single IndirectUnsafe dependency. 1173// If there were too many dependences, LAA abandons recording them. We can't 1174// proceed safely if we don't know what the dependences are. 1179// Ignore dependencies that are either known to be safe or can be 1180// checked at runtime. 1185// We're only interested in IndirectUnsafe dependencies here, where the 1186// address might come from a load from memory. We also only want to handle 1187// one such dependency, at least for now. 1196// For now only normal loads and stores are supported. 1203LLVM_DEBUG(
dbgs() <<
"LV: Checking for a histogram on: " << *SI <<
"\n");
1207bool LoopVectorizationLegality::canVectorizeMemory() {
1208 LAI = &LAIs.
getInfo(*TheLoop);
1213"loop not vectorized: ", *LAR);
1218return canVectorizeIndirectUnsafeDependences();
1222"write to a loop invariant address could not " 1224"CantVectorizeStoreToLoopInvariantAddress", ORE,
1229// We can vectorize stores to invariant address when final reduction value is 1230// guaranteed to be stored at the end of the loop. Also, if decision to 1231// vectorize loop is made, runtime checks are added so as to make sure that 1232// invariant address won't alias with any other objects. 1234// For each invariant address, check if last stored value is unconditional 1235// and the address is not calculated inside the loop. 1242"We don't allow storing to uniform addresses",
1243"write of conditional recurring variant value to a loop " 1244"invariant address could not be vectorized",
1245"CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1249// Invariant address should be defined outside of loop. LICM pass usually 1250// makes sure it happens, but in rare cases it does not, we do not want 1251// to overcomplicate vectorization to support this case. 1255"Invariant address is calculated inside the loop",
1256"write to a loop invariant address could not " 1258"CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1265// For each invariant address, check its last stored value is the result 1266// of one of our reductions. 1268// We do not check if dependence with loads exists because that is already 1269// checked via hasLoadStoreDependenceInvolvingLoopInvariantAddress. 1274// Earlier stores to this address are effectively deadcode. 1275// With opaque pointers it is possible for one pointer to be used with 1276// different sizes of stored values: 1277// store i32 0, ptr %x 1278// store i8 0, ptr %x 1279// The latest store doesn't complitely overwrite the first one in the 1280// example. That is why we have to make sure that types of stored 1282// TODO: Check that bitwidth of unhandled store is smaller then the 1283// one that overwrites it and add a test. 1286I->getValueOperand()->getType() ==
1287SI->getValueOperand()->getType();
1294bool IsOK = UnhandledStores.
empty();
1295// TODO: we should also validate against InvariantMemSets. 1298"We don't allow storing to uniform addresses",
1299"write to a loop invariant address could not " 1301"CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
1312bool EnableStrictReductions) {
1314// First check if there is any ExactFP math or if we allow reassociations 1318// If the above is false, we have ExactFPMath & do not allow reordering. 1319// If the EnableStrictReductions flag is set, first check if we have any 1320// Exact FP induction vars, which we cannot vectorize. 1321if (!EnableStrictReductions ||
1328// We can now only vectorize if all reductions with Exact FP math also 1329// have the isOrdered flag set, which indicates that we can move the 1330// reduction operations in-loop. 1352return V == InvariantAddress ||
1359PHINode *PN = dyn_cast_or_null<PHINode>(In0);
1363return Inductions.
count(PN);
1388constValue *V)
const{
1389auto *Inst = dyn_cast<Instruction>(V);
1390return (Inst && InductionCastsToIgnore.
count(Inst));
1399return FixedOrderRecurrences.
count(Phi);
1403// When vectorizing early exits, create predicates for the latch block only. 1404// The early exiting block must be a direct predecessor of the latch at the 1410"Uncountable exiting block must be a direct predecessor of latch");
1416bool LoopVectorizationLegality::blockCanBePredicated(
1420// We can predicate blocks with calls to assume, as long as we drop them in 1421// case we flatten the CFG via predication. 1422if (
match(&
I, m_Intrinsic<Intrinsic::assume>())) {
1427// Do not let llvm.experimental.noalias.scope.decl block the vectorization. 1428// TODO: there might be cases that it should block the vectorization. Let's 1429// ignore those for now. 1430if (isa<NoAliasScopeDeclInst>(&
I))
1433// We can allow masked calls if there's at least one vector variant, even 1434// if we end up scalarizing due to the cost model calculations. 1435// TODO: Allow other calls if they have appropriate attributes... readonly 1437if (
CallInst *CI = dyn_cast<CallInst>(&
I))
1443// Loads are handled via masking (or speculated if safe to do so.) 1444if (
auto *LI = dyn_cast<LoadInst>(&
I)) {
1450// Predicated store requires some form of masking: 1451// 1) masked store HW instruction, 1452// 2) emulation via load-blend-store (only if safe and legal to do so, 1453// be aware on the race conditions), or 1454// 3) element-by-element predicate check and scalar store. 1455if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
1460if (
I.mayReadFromMemory() ||
I.mayWriteToMemory() ||
I.mayThrow())
1467bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
1470"IfConversionDisabled", ORE, TheLoop);
1476// A list of pointers which are known to be dereferenceable within scope of 1477// the loop body for each iteration of the loop which executes. That is, 1478// the memory pointed to can be dereferenced (with the access size implied by 1479// the value's type) unconditionally within the loop header without 1480// introducing a new fault. 1483// Collect safe addresses. 1492// For a block which requires predication, a address may be safe to access 1493// in the loop w/o predication if we can prove dereferenceability facts 1494// sufficient to ensure it'll never fault within the loop. For the moment, 1495// we restrict this to loads; stores are more complicated due to 1496// concurrency restrictions. 1501// Pass the Predicates pointer to isDereferenceableAndAlignedInLoop so 1502// that it will consider loops that need guarding by SCEV checks. The 1503// vectoriser will generate these checks if we decide to vectorise. 1512// Collect the blocks that need predication. 1514// We support only branches and switch statements as terminators inside the 1516if (isa<SwitchInst>(BB->getTerminator())) {
1519"LoopContainsUnsupportedSwitch", ORE,
1520 TheLoop, BB->getTerminator());
1523 }
elseif (!isa<BranchInst>(BB->getTerminator())) {
1525"LoopContainsUnsupportedTerminator", ORE,
1526 TheLoop, BB->getTerminator());
1530// We must be able to predicate all blocks that need to be predicated. 1532 !blockCanBePredicated(BB, SafePointers, MaskedOp)) {
1534"Control flow cannot be substituted for a select",
"NoCFGForSelect",
1535 ORE, TheLoop, BB->getTerminator());
1540// We can if-convert this loop. 1544// Helper function to canVectorizeLoopNestCFG. 1545bool LoopVectorizationLegality::canVectorizeLoopCFG(
Loop *Lp,
1546bool UseVPlanNativePath) {
1548"VPlan-native path is not enabled.");
1550// TODO: ORE should be improved to show more accurate information when an 1551// outer loop can't be vectorized because a nested loop is not understood or 1552// legal. Something like: "outer_loop_location: loop not vectorized: 1553// (inner_loop_location) loop control flow is not understood by vectorizer". 1555// Store the result and return it at the end instead of exiting early, in case 1556// allowExtraAnalysis is used to report multiple reasons for not vectorizing. 1560// We must have a loop in canonical form. Loops with indirectbr in them cannot 1564"loop control flow is not understood by vectorizer",
1565"CFGNotUnderstood", ORE, TheLoop);
1572// We must have a single backedge. 1575"loop control flow is not understood by vectorizer",
1576"CFGNotUnderstood", ORE, TheLoop);
1586bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1587Loop *Lp,
bool UseVPlanNativePath) {
1588// Store the result and return it at the end instead of exiting early, in case 1589// allowExtraAnalysis is used to report multiple reasons for not vectorizing. 1592if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1599// Recursively check whether the loop control flow of nested loops is 1601for (
Loop *SubLp : *Lp)
1602if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1612bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
1616"Cannot vectorize early exit loop",
1617"NoLatchEarlyExit", ORE, TheLoop);
1621if (Reductions.
size() || FixedOrderRecurrences.
size()) {
1623"Found reductions or recurrences in early-exit loop",
1624"Cannot vectorize early exit loop with reductions or recurrences",
1625"RecurrencesInEarlyExitLoop", ORE, TheLoop);
1632// Keep a record of all the exiting blocks. 1634 std::optional<std::pair<BasicBlock *, BasicBlock *>> SingleUncountableEdge;
1638if (isa<SCEVCouldNotCompute>(EC)) {
1640if (Succs.size() != 2) {
1642"Early exiting block does not have exactly two successors",
1643"Incorrect number of successors from early exiting block",
1644"EarlyExitTooManySuccessors", ORE, TheLoop);
1650 ExitBlock = Succs[0];
1653 ExitBlock = Succs[1];
1656if (SingleUncountableEdge) {
1658"Loop has too many uncountable exits",
1659"Cannot vectorize early exit loop with more than one early exit",
1660"TooManyUncountableEarlyExits", ORE, TheLoop);
1664 SingleUncountableEdge = {BB, ExitBlock};
1666 CountableExitingBlocks.push_back(BB);
1668// We can safely ignore the predicates here because when vectorizing the loop 1669// the PredicatatedScalarEvolution class will keep track of all predicates 1670// for each exiting block anyway. This happens when calling 1671// PSE.getSymbolicMaxBackedgeTakenCount() below. 1674if (!SingleUncountableEdge) {
1679// The only supported early exit loops so far are ones where the early 1680// exiting block is a unique predecessor of the latch block. 1682if (LatchPredBB != SingleUncountableEdge->first) {
1684"Cannot vectorize early exit loop",
1685"EarlyExitNotLatchPredecessor", ORE, TheLoop);
1689// The latch block must have a countable exit. 1690if (isa<SCEVCouldNotCompute>(
1693"Cannot determine exact exit count for latch block",
1694"Cannot vectorize early exit loop",
1695"UnknownLatchExitCountEarlyExitLoop", ORE, TheLoop);
1699"Latch block not found in list of countable exits!");
1701// Check to see if there are instructions that could potentially generate 1702// exceptions or have side-effects. 1704switch (
I->getOpcode()) {
1705case Instruction::Load:
1706case Instruction::Store:
1707case Instruction::PHI:
1708case Instruction::Br:
1709// These are checked separately. 1716for (
auto *BB : TheLoop->
blocks())
1717for (
auto &
I : *BB) {
1718if (
I.mayWriteToMemory()) {
1719// We don't support writes to memory. 1721"Writes to memory unsupported in early exit loops",
1722"Cannot vectorize early exit loop with writes to memory",
1723"WritesInEarlyExitLoop", ORE, TheLoop);
1725 }
elseif (!IsSafeOperation(&
I)) {
1727"cannot be speculatively executed",
1728"UnsafeOperationsEarlyExitLoop", ORE,
1734// The vectoriser cannot handle loads that occur after the early exit block. 1736"Expected latch predecessor to be the early exiting block");
1738// TODO: Handle loops that may fault. 1744"Cannot vectorize potentially faulting early exit loop",
1745"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
1749 [[maybe_unused]]
constSCEV *SymbolicMaxBTC =
1751// Since we have an exact exit count for the latch and the early exit 1752// dominates the latch, then this should guarantee a computed SCEV value. 1753assert(!isa<SCEVCouldNotCompute>(SymbolicMaxBTC) &&
1754"Failed to get symbolic expression for backedge taken count");
1755LLVM_DEBUG(
dbgs() <<
"LV: Found an early exit loop with symbolic max " 1756"backedge taken count: " 1757 << *SymbolicMaxBTC <<
'\n');
1758 UncountableEdge = SingleUncountableEdge;
1763// Store the result and return it at the end instead of exiting early, in case 1764// allowExtraAnalysis is used to report multiple reasons for not vectorizing. 1768// Check whether the loop-related control flow in the loop nest is expected by 1770if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
1771if (DoExtraAnalysis) {
1779// We need to have a loop header. 1783// Specific checks for outer loops. We skip the remaining legal checks at this 1784// point because they don't support outer loops. 1786assert(UseVPlanNativePath &&
"VPlan-native path is not enabled.");
1788if (!canVectorizeOuterLoop()) {
1790"UnsupportedOuterLoop", ORE, TheLoop);
1791// TODO: Implement DoExtraAnalysis when subsequent legal checks support 1801// Check if we can if-convert non-single-bb loops. 1803if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1811// Check if we can vectorize the instructions and CFG in this loop. 1812if (!canVectorizeInstrs()) {
1813LLVM_DEBUG(
dbgs() <<
"LV: Can't vectorize the instructions or CFG\n");
1823"UnsupportedUncountableLoop", ORE, TheLoop);
1829if (!isVectorizableEarlyExitLoop()) {
1830 UncountableEdge = std::nullopt;
1839// Go over each instruction and look at memory deps. 1840if (!canVectorizeMemory()) {
1841LLVM_DEBUG(
dbgs() <<
"LV: Can't vectorize due to memory conflicts\n");
1851 ?
" (with a runtime bound check)" 1862"due to SCEVThreshold");
1864"Too many SCEV assumptions need to be made and checked at runtime",
1865"TooManySCEVRunTimeChecks", ORE, TheLoop);
1872// Okay! We've done all the tests. If any have failed, return false. Otherwise 1873// we can vectorize, and at this point we don't have any other mem analysis 1874// which may limit our maximum vectorization factor, so just return true with 1881LLVM_DEBUG(
dbgs() <<
"LV: checking if tail can be folded by masking.\n");
1888// TODO: handle non-reduction outside users when tail is folded by masking. 1889for (
auto *AE : AllowedExit) {
1890// Check that all users of allowed exit values are inside the loop or 1891// are the live-out of a reduction. 1892if (ReductionLiveOuts.
count(AE))
1894for (
User *U : AE->users()) {
1900 <<
"LV: Cannot fold tail by masking, loop has an outside user for " 1907PHINode *OrigPhi = Entry.first;
1909auto *UI = cast<Instruction>(U);
1911LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking, loop IV has an " 1919// The list of pointers that we can safely read and write to remains empty. 1922// Check all blocks for predication, including those that ordinarily do not 1923// need predication such as the header block. 1926if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp)) {
1938// The list of pointers that we can safely read and write to remains empty. 1941// Mark all blocks for predication, including those that ordinarily do not 1942// need predication such as the header block. 1944 [[maybe_unused]]
bool R = blockCanBePredicated(BB, SafePointers, MaskedOp);
1945assert(R &&
"Must be able to predicate block when tail-folding.");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
loop Loop Strength Reduction
static cl::opt< LoopVectorizeHints::ScalableForceKind > ForceScalableVectorization("scalable-vectorization", cl::init(LoopVectorizeHints::SK_Unspecified), cl::Hidden, cl::desc("Control whether the compiler can use scalable vectors to " "vectorize a loop"), cl::values(clEnumValN(LoopVectorizeHints::SK_FixedWidthOnly, "off", "Scalable vectorization is disabled."), clEnumValN(LoopVectorizeHints::SK_PreferScalable, "preferred", "Scalable vectorization is available and favored when the " "cost is inconclusive."), clEnumValN(LoopVectorizeHints::SK_PreferScalable, "on", "Scalable vectorization is available and favored when the " "cost is inconclusive.")))
static cl::opt< unsigned > PragmaVectorizeSCEVCheckThreshold("pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed with a " "vectorize(enable) pragma"))
static const unsigned MaxInterleaveFactor
Maximum vectorization interleave count.
static cl::opt< bool > AllowStridedPointerIVs("lv-strided-pointer-ivs", cl::init(false), cl::Hidden, cl::desc("Enable recognition of non-constant strided " "pointer induction variables."))
static cl::opt< unsigned > VectorizeSCEVCheckThreshold("vectorize-scev-check-threshold", cl::init(16), cl::Hidden, cl::desc("The maximum number of SCEV checks allowed."))
static cl::opt< bool > EnableHistogramVectorization("enable-histogram-loop-vectorization", cl::init(false), cl::Hidden, cl::desc("Enables autovectorization of some loops containing histograms"))
static cl::opt< bool > EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization."))
This file defines the LoopVectorizationLegality class.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void visit(MachineFunction &MF, MachineBasicBlock &Start, std::function< void(MachineBasicBlock *)> op)
This pass exposes codegen information to IR-level passes.
Virtual Register Rewriter
static const uint32_t IV[8]
Class for arbitrary precision integers.
LLVM Basic Block Representation.
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
This class represents a function call, abstracting a target machine's calling convention.
static ConstantAsMetadata * get(Constant *C)
This is the shared class of boolean and integer constants.
A parsed version of the target data layout string in and methods for querying it.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
constexpr bool isScalar() const
Exactly one element.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
Instruction * getExactFPMathInst()
Returns floating-point induction operator that does not allow reassociation (transforming the inducti...
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Value * getPointerOperand()
const LoopAccessInfo & getInfo(Loop &L)
const MemoryDepChecker & getDepChecker() const
the Memory Dependence Checker which can determine the loop-independent and loop-carried dependences b...
ArrayRef< StoreInst * > getStoresToInvariantAddresses() const
Return the list of stores to invariant addresses.
const OptimizationRemarkAnalysis * getReport() const
The diagnostics report generated for the analysis.
const RuntimePointerChecking * getRuntimePointerChecking() const
bool canVectorizeMemory() const
Return true we can analyze the memory accesses in the loop and there are no memory dependence cycles.
bool isInvariant(Value *V) const
Returns true if value V is loop invariant.
bool hasLoadStoreDependenceInvolvingLoopInvariantAddress() const
Return true if the loop has memory dependence involving a load and a store to an invariant address,...
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop, DominatorTree *DT)
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
bool hasStoreStoreDependenceInvolvingLoopInvariantAddress() const
Return true if the loop has memory dependence involving two stores to an invariant address,...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
unsigned getNumBlocks() const
Get the number of blocks in this loop in constant time.
unsigned getNumBackEdges() const
Calculate the number of back edges to the loop header.
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
bool isLoopExiting(const BlockT *BB) const
True if terminator in the block can branch to another block that is outside of the current loop.
bool isLoopHeader(const BlockT *BB) const
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
bool isUniform(Value *V, ElementCount VF) const
Returns true if value V is uniform across VF lanes, when VF is provided, and otherwise if V is invari...
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isInductionVariable(const Value *V) const
Returns True if V can be considered as an induction variable in this loop.
bool isCastedInductionVariable(const Value *V) const
Returns True if V is a cast that is part of an induction def-use chain, and had been proven to be red...
Instruction * getExactFPInst()
void addExactFPMathInst(Instruction *I)
Track the 1st floating-point instruction that can not be reassociated.
@ SK_PreferScalable
Vectorize loops using scalable vectors or fixed-width vectors, but favor scalable vectors when the co...
@ SK_Unspecified
Not selected.
@ SK_FixedWidthOnly
Disables vectorization with scalable vectors.
enum ForceKind getForce() const
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When enabling loop hints are provided we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
ElementCount getWidth() const
@ FK_Enabled
Forcing enabled.
@ FK_Undefined
Not selected.
@ FK_Disabled
Forcing disabled.
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
LoopVectorizeHints(const Loop *L, bool InterleaveOnlyWhenForced, OptimizationRemarkEmitter &ORE, const TargetTransformInfo *TTI=nullptr)
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
unsigned getInterleave() const
unsigned getIsVectorized() const
Represents a single loop in the control flow graph.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
void setLoopID(MDNode *LoopID) const
Set the llvm.loop loop id metadata for this loop.
PHINode * getCanonicalInductionVariable() const
Check to see if the loop has a canonical induction variable: an integer recurrence that starts at 0 a...
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
const MDOperand & getOperand(unsigned I) const
ArrayRef< MDOperand > operands() const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
unsigned getNumOperands() const
Return number of MDNode operands.
Tracking metadata reference owned by Metadata.
StringRef getString() const
static MDString * get(LLVMContext &Context, StringRef Str)
size_type count(const KeyT &Key) const
iterator find(const KeyT &Key)
Checks memory dependences among accesses to the same underlying object to determine whether there vec...
const SmallVectorImpl< Dependence > * getDependences() const
Returns the memory dependences.
Root of the metadata hierarchy.
Diagnostic information for optimization analysis remarks.
static const char * AlwaysPrint
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
void addPredicate(const SCEVPredicate &Pred)
Adds a new predicate.
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static bool isFixedOrderRecurrence(PHINode *Phi, Loop *TheLoop, DominatorTree *DT)
Returns true if Phi is a fixed-order recurrence.
bool hasExactFPMath() const
Returns true if the recurrence has floating-point math that requires precise (ordered) operations.
Instruction * getLoopExitInstr() const
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
StoreInst * IntermediateStore
Reductions may store temporary or final result to an invariant address.
bool Need
This flag indicates if we need to add the runtime check.
This node represents a polynomial recurrence on the trip count of the specified loop.
const SCEV * getStart() const
const SCEV * getStepRecurrence(ScalarEvolution &SE) const
Constructs and returns the recurrence indicating how much this expression steps by.
const Loop * getLoop() const
virtual unsigned getComplexity() const
Returns the estimated complexity of this predicate.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This visitor recursively visits a SCEV expression and re-writes it.
const SCEV * visit(const SCEV *S)
This means that we are dealing with an entirely unknown SCEV value, and only represent it as its LLVM...
This class represents an analyzed expression in the program.
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
const SCEV * getPredicatedExitCount(const Loop *L, const BasicBlock *ExitingBlock, SmallVectorImpl< const SCEVPredicate * > *Predicates, ExitCountKind Kind=Exact)
Same as above except this uses the predicated backedge taken info and may require predicates.
const SCEV * getCouldNotCompute()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
static constexpr size_t npos
Provides information about what library functions are available for the current target.
bool hasOptimizedCodeGen(LibFunc F) const
Tests if the function is both available and a candidate for optimized code generation.
void getWidestVF(StringRef ScalarF, ElementCount &FixedVF, ElementCount &ScalableVF) const
Returns the largest vectorization factor used in the list of vector functions.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
bool isFunctionVectorizable(StringRef F, const ElementCount &VF) const
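A sketch of asking the TLI whether a scalar library call has a vector counterpart at a given VF; the helper name is hypothetical:

  #include "llvm/Analysis/TargetLibraryInfo.h"

  using namespace llvm;

  // Hypothetical helper: is Name an available libcall with a vector variant?
  static bool hasVectorVariant(const TargetLibraryInfo &TLI, StringRef Name,
                               ElementCount VF) {
    LibFunc Func;
    if (!TLI.getLibFunc(Name, Func) || !TLI.has(Func))
      return false;
    return TLI.isFunctionVectorizable(Name, VF);
  }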
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
bool isLegalNTLoad(Type *DataType, Align Alignment) const
Return true if the target supports nontemporal load.
bool isLegalNTStore(Type *DataType, Align Alignment) const
Return true if the target supports nontemporal store.
bool enableScalableVectorization() const
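A minimal sketch of the nontemporal-access legality query; the helper name is hypothetical, while isLegalNTStore is the interface listed above:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Hypothetical helper: can this nontemporal store be widened by the target?
  static bool canVectorizeNontemporalStore(const TargetTransformInfo &TTI,
                                           StoreInst *ST) {
    Align Alignment = ST->getAlign();
    Type *Ty = ST->getValueOperand()->getType();
    return TTI.isLegalNTStore(Ty, Alignment);
  }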
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
std::string str() const
Return the twine contents as a std::string.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
bool isPointerTy() const
True if this is an instance of PointerType.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
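A sketch of the element-type screening these predicates enable; the helper name and the MaxBits policy are illustrative assumptions:

  #include "llvm/IR/Type.h"

  using namespace llvm;

  // Hypothetical helper: accept only pointer, integer, or FP scalars that
  // fit within MaxBits.
  static bool isNarrowEnoughScalar(Type *Ty, unsigned MaxBits) {
    if (Ty->isPointerTy())
      return true; // pointers are sized by the data layout, handled separately
    if (!Ty->isIntegerTy() && !Ty->isFloatingPointTy())
      return false;
    return Ty->getScalarSizeInBits() <= MaxBits;
  }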
Value * getOperand(unsigned i) const
static bool hasMaskedVariant(const CallInst &CI, std::optional< ElementCount > VF=std::nullopt)
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
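A sketch of how these two static queries combine; the helper name is hypothetical:

  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Hypothetical helper: does this call site carry any vector mapping
  // (e.g. attached via the vector-function-abi-variant attribute)?
  static bool callHasVectorMapping(const CallInst &CI) {
    return !VFDatabase::getMappings(CI).empty() ||
           VFDatabase::hasMaskedVariant(CI);
  }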
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
iterator_range< user_iterator > users()
StringRef getName() const
Return a constant reference to the value's name.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
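A small worked example of these ElementCount operations (the function is purely illustrative):

  #include "llvm/Support/TypeSize.h"

  using namespace llvm;

  static void elementCountExamples() {
    ElementCount Fixed8 = ElementCount::getFixed(8);    // <8 x ty>
    ElementCount Scal4  = ElementCount::getScalable(4); // <vscale x 4 x ty>
    bool S = Scal4.isScalable();             // true: scaled by runtime vscale
    unsigned Min = Scal4.getKnownMinValue(); // 4: minimum it can represent
    // Ordering is only meaningful between quantities of the same kind:
    bool LE = ElementCount::isKnownLE(ElementCount::getFixed(4), Fixed8);
    (void)S; (void)Min; (void)LE;
  }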
const ParentTy * getParent() const
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
CallingConv::C
The default llvm calling convention, compatible with C.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
match_combine_or< match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > >, OpTy > m_ZExtOrSExtOrSelf(const OpTy &Op)
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
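These matchers are what a load-update-store shape check is built from. A sketch in the spirit of the histogram recognition below; the helper name is hypothetical and this is not the file's verbatim pattern:

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/PatternMatch.h"

  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Hypothetical helper: does SI store (add (load Ptr), Inc) back to Ptr?
  static bool looksLikeCounterUpdate(StoreInst *SI) {
    Value *Ptr = SI->getPointerOperand();
    Value *Inc;
    return match(SI->getValueOperand(),
                 m_Add(m_Load(m_Specific(Ptr)), m_Value(Inc)));
  }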
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to the ValuesClass constructor.
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
NodeAddr< FuncNode * > Func
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
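A sketch of these range helpers replacing explicit begin()/end() loops; the helper name is hypothetical:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  // Hypothetical helper: is every instruction after the first a non-PHI?
  static bool onlyLeadingPHI(BasicBlock &BB) {
    return all_of(drop_begin(BB),
                  [](const Instruction &I) { return !isa<PHINode>(I); });
  }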
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns the intrinsic ID for the call.
cl::opt< bool > HintsAllowReordering("hints-allow-reordering", cl::init(true), cl::Hidden, cl::desc("Allow enabling loop hints to reorder " "FP operations during vectorization."))
static Type * getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1)
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
static Type * convertPointerToIntegerType(const DataLayout &DL, Type *Ty)
static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp)
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
static bool isUniformLoop(Loop *Lp, Loop *OuterLp)
bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with an active sanitizer.
static bool canWidenCallReturnType(Type *Ty)
Returns true if the call return type Ty can be widened by the loop vectorizer.
bool isDereferenceableReadOnlyLoop(Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if the loop L cannot fault on any iteration and only contains read-only memory accesses.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
auto reverse(ContainerTy &&C)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
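A sketch of the consecutive-access test the vectorizer performs before treating a load/store as widenable; the helper name is hypothetical and the default arguments listed above are relied on:

  #include "llvm/Analysis/LoopAccessAnalysis.h"

  using namespace llvm;

  // Hypothetical helper: is the access unit-stride in units of AccessTy?
  static bool isConsecutiveAccess(PredicatedScalarEvolution &PSE,
                                  Type *AccessTy, Value *Ptr, const Loop *Lp) {
    std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp);
    // A stride of -1 would be a reverse, but still strided, access.
    return Stride && *Stride == 1;
  }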
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, SmallPtrSetImpl< Value * > &AllowedExit)
Check that the instruction has outside loop users and is not an identified reduction variable.
static bool storeToSameAddress(ScalarEvolution *SE, StoreInst *A, StoreInst *B)
Returns true if A and B have same pointer operands or same SCEVs addresses.
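A plausible shape for this check, per the description above (a sketch, not the verbatim implementation):

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/IR/Instructions.h"

  using namespace llvm;

  static bool storeToSameAddressSketch(ScalarEvolution *SE, StoreInst *A,
                                       StoreInst *B) {
    // Trivially the same pointer operand.
    if (A->getPointerOperand() == B->getPointerOperand())
      return true;
    // Otherwise compare the SCEVs of the two addresses.
    return SE->getSCEV(A->getPointerOperand()) ==
           SE->getSCEV(B->getPointerOperand());
  }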
bool canVectorizeTy(Type *Ty)
Returns true if Ty is a valid vector element type, void, or an unpacked literal struct where all elements are vectorizable.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding optimization remark.
llvm::MDNode * makePostTransformationMetadata(llvm::LLVMContext &Context, MDNode *OrigLoopID, llvm::ArrayRef< llvm::StringRef > RemovePrefixes, llvm::ArrayRef< llvm::MDNode * > AddAttrs)
Create a new LoopID after the loop has been transformed.
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent to: C.erase(remove_if(C, pred), C.end());
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
static bool findHistogram(LoadInst *LI, StoreInst *HSt, Loop *TheLoop, const PredicatedScalarEvolution &PSE, SmallVectorImpl< HistogramInfo > &Histograms)
Find histogram operations that match high-level code in loops, such as: buckets[indices[i]] += step;
static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI)
Checks if a function is scalarizable according to the TLI, in the sense that it should be vectorized and then expanded in multiple scalar calls.
bool isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L, ScalarEvolution &SE, DominatorTree &DT, AssumptionCache *AC=nullptr, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Return true if we can prove that the given load (which is assumed to be within the specified loop) would access only dereferenceable memory, and be properly aligned on every iteration of the specified loop regardless of its placement within the loop.
bool SCEVExprContains(const SCEV *Root, PredTy Pred)
Return true if any node in Root satisfies the predicate Pred.
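A sketch of scanning a SCEV tree with a predicate; the lambda plays the role of the PredTy parameter, and the helper name is hypothetical:

  #include "llvm/Analysis/ScalarEvolutionExpressions.h"

  using namespace llvm;

  // Hypothetical helper: does Root contain any component opaque to SCEV
  // (i.e. a SCEVUnknown leaf)?
  static bool dependsOnUnknown(const SCEV *Root) {
    return SCEVExprContains(
        Root, [](const SCEV *S) { return isa<SCEVUnknown>(S); });
  }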
Dependence between memory access instructions.
Instruction * getDestination(const MemoryDepChecker &DepChecker) const
Return the destination instruction of the dependence.
Instruction * getSource(const MemoryDepChecker &DepChecker) const
Return the source instruction of the dependence.
static VectorizationSafetyStatus isSafeForVectorization(DepType Type)
Dependence types that don't prevent vectorization.
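A sketch of walking the recorded dependences with these accessors; the helper name is hypothetical, and dependences are only available when the checker was asked to record them:

  #include "llvm/Analysis/LoopAccessAnalysis.h"

  using namespace llvm;

  // Hypothetical helper: are all recorded dependences safe to vectorize?
  static bool allDepsSafe(const MemoryDepChecker &DepChecker) {
    const auto *Deps = DepChecker.getDependences();
    if (!Deps)
      return false; // dependences were not recorded
    for (const MemoryDepChecker::Dependence &D : *Deps) {
      // Map the dependence back to its endpoints if needed for diagnostics.
      Instruction *Src = D.getSource(DepChecker);
      Instruction *Dst = D.getDestination(DepChecker);
      (void)Src; (void)Dst;
      if (MemoryDepChecker::Dependence::isSafeForVectorization(D.Type) !=
          MemoryDepChecker::VectorizationSafetyStatus::Safe)
        return false;
    }
    return true;
  }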
An object of this class is returned by queries that could not be answered.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
Collection of parameters shared between the Loop Vectorizer and the Loop Access Analysis.
static const unsigned MaxVectorWidth
Maximum SIMD width.
static bool isInterleaveForced()
True if force-vector-interleave was specified by the user.
static unsigned VectorizationInterleave
Interleave factor as overridden by the user.