//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180cl::desc(
"Enable vectorization of epilogue loops."));
184cl::desc(
"When epilogue vectorization is enabled, and a value greater than " 185"1 is specified, forces the given VF for all applicable epilogue " 189"epilogue-vectorization-minimum-VF",
cl::Hidden,
190cl::desc(
"Only loops with vectorization factor equal to or larger than " 191"the specified value are considered for epilogue vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."),
    cl::desc("The maximum allowed number of runtime memory checks"));
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
} // namespace PreferPredicateTy

    "prefer-predicate-over-epilogue",
222cl::desc(
"Tail-folding and predication preferences over creating a scalar " 226"Don't tail-predicate loops, create scalar epilogue"),
228"predicate-else-scalar-epilogue",
229"prefer tail-folding, create scalar epilogue if tail " 232"predicate-dont-vectorize",
233"prefers tail-folding, don't attempt vectorization if " 234"tail-folding fails.")));
237"force-tail-folding-style",
cl::desc(
"Force the tail folding style"),
240clEnumValN(TailFoldingStyle::None,
"none",
"Disable tail folding"),
242 TailFoldingStyle::Data,
"data",
243"Create lane mask for data only, using active.lane.mask intrinsic"),
244clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245"data-without-lane-mask",
246"Create lane mask with compare/stepvector"),
247clEnumValN(TailFoldingStyle::DataAndControlFlow,
"data-and-control",
248"Create lane mask using active.lane.mask intrinsic, and use " 249"it for both data and control flow"),
250clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251"data-and-control-without-rt-check",
252"Similar to data-and-control, but remove the runtime check"),
253clEnumValN(TailFoldingStyle::DataWithEVL,
"data-with-evl",
254"Use predicated EVL instructions for tail folding. If EVL " 255"is unsupported, fallback to data-without-lane-mask.")));
259cl::desc(
"Maximize bandwidth when selecting vectorization factor which " 260"will be determined by the smallest type in loop."));
264cl::desc(
"Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
    cl::desc(
"Enable vectorization on masked interleaved memory accesses in a loop"));
274cl::desc(
"A flag that overrides the target's number of scalar registers."));
278cl::desc(
"A flag that overrides the target's number of vector registers."));
282cl::desc(
"A flag that overrides the target's max interleave factor for " 287cl::desc(
"A flag that overrides the target's max interleave factor for " 288"vectorized loops."));
292cl::desc(
"A flag that overrides the target's expected cost for " 293"an instruction to a single constant value. Mostly " 294"useful for getting consistent testing."));
299"Pretend that scalable vectors are supported, even if the target does " 300"not support them. This flag should only be used for testing."));
305"The cost of a loop that is considered 'small' by the interleaver."));
309cl::desc(
"Enable the use of the block frequency analysis to access PGO " 310"heuristics minimizing code growth in cold regions and being more " 311"aggressive in hot regions."));
// Interleave loops at runtime for load/store throughput.
        "Enable runtime interleaving until load/store ports are saturated"));
/// The number of stores in a loop that are allowed to need predication.
    cl::desc(
"Max number of stores to be predicated behind an if."));
326cl::desc(
"Count the induction variable only once when interleaving"));
330cl::desc(
"Enable if predication of stores during vectorization."));
334cl::desc(
"The maximum interleave count to use when interleaving a scalar " 335"reduction in a nested loop."));
340cl::desc(
"Prefer in-loop vector reductions, " 341"overriding the targets preference."));
345cl::desc(
"Enable the vectorisation of loops with in-order (strict) " 351"Prefer predicating a reduction operation over an after loop select."));
356cl::desc(
"Enable VPlan-native vectorization path with " 357"support for outer loop vectorization."));
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
373cl::desc(
"Enable loop interleaving in Loop vectorization passes"));
376cl::desc(
"Run the Loop vectorization passes"));
379"force-widen-divrem-via-safe-divisor",
cl::Hidden,
381"Override cost based safe divisor widening for div/rem instructions"));
384"vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
386cl::desc(
"Try wider VFs if they enable the use of vector variants"));
391"Enable vectorization of early exit loops with uncountable exits."));
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
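// For example (illustrative): on targets where x86_fp80 has a type size of 80
// bits but an allocation size of 96 or 128 bits, arrays of x86_fp80 contain
// padding between elements, so the helper above reports the type as irregular
// and <N x x86_fp80> cannot be treated as bitcast compatible with such arrays.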
413/// Returns "best known" trip count for the specified loop \p L as defined by 414/// the following procedure: 415/// 1) Returns exact trip count if it is known. 416/// 2) Returns expected trip count according to profile data if any. 417/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. 418/// 4) Returns std::nullopt if all of the above failed. 419static std::optional<unsigned>
                                           bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  // Check if there is an expected trip count available from profile data.
  if (!CanUseConstantMax)
  // Check if upper bound estimate is known.

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
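// Worked example for the "best known" trip count procedure above
// (illustrative): for a loop whose exact trip count SCEV cannot compute but
// whose latch branch carries profile weights of 96 (backedge) to 4 (exit),
// step 2) yields an expected trip count of roughly 96/4 + 1 = 25, which later
// cost decisions (e.g. sizing runtime checks) can use instead of giving up.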
452/// InnerLoopVectorizer vectorizes loops which contain only one basic 453/// block to a specified vectorization factor (VF). 454/// This class performs the widening of scalars into vectors, or multiple 455/// scalars. This class also implements the following features: 456/// * It inserts an epilogue loop for handling loops that don't have iteration 457/// counts that are known to be a multiple of the vectorization factor. 458/// * It handles the code generation for reduction variables. 459/// * Scalarization (implementation using scalars) of un-vectorizable 461/// InnerLoopVectorizer does not perform any vectorization-legality 462/// checks, and relies on the caller to check for the different legality 463/// aspects. The InnerLoopVectorizer relies on the 464/// LoopVectorizationLegality class to provide information about the induction 465/// and reduction variables that were found to a given vectorization factor. 484// Query this against the original loop and save it here because the profile 485// of the original loop header may change as the transformation happens. 492 /// Create a new empty loop that will contain vectorized instructions later 493 /// on, while the old loop will be used as the scalar remainder. Control flow 494 /// is generated around the vectorized (and scalar epilogue) loops consisting 495 /// of various checks and bypasses. Return the pre-header block of the new 496 /// loop. In the case of epilogue vectorization, this function is overriden to 497 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is 498 /// used to look up SCEV expansions for expressions needed during skeleton 503 /// Fix the vectorized code, taking care of header phi's, and more. 506// Return true if any runtime check is added. 509 /// A helper function to scalarize a single Instruction in the innermost loop. 510 /// Generates a sequence of scalar instances for each lane between \p MinLane 511 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 512 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 513 /// Instr's operands. 518 /// Fix the non-induction PHIs in \p Plan. 521 /// Returns the original loop trip count. 524 /// Used to set the trip count after ILV's construction and after the 525 /// preheader block has been executed. Note that this always holds the trip 526 /// count of the original loop for both main loop and epilogue vectorization. 529// Retrieve the additional bypass value associated with an original 530 /// induction header phi. 535 /// Return the additional bypass block which targets the scalar loop by 536 /// skipping the epilogue loop after completing the main loop. 539"Trying to access AdditionalBypassBlock but it has not been set");
546 /// Iteratively sink the scalarized operands of a predicated instruction into 547 /// the block that was created for it. 550 /// Returns (and creates if needed) the trip count of the widened loop. 553 /// Emit a bypass check to see if the vector trip count is zero, including if 557 /// Emit a bypass check to see if all of the SCEV assumptions we've 558 /// had to make are correct. Returns the block containing the checks or 559 /// nullptr if no checks have been added. 562 /// Emit bypass checks to check any memory assumptions we may have made. 563 /// Returns the block containing the checks or nullptr if no checks have been 567 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 568 /// vector loop preheader, middle block and scalar preheader. 571 /// Create and record the values for induction variables to resume coming from 572 /// the additional bypass block. 574Value *MainVectorTripCount);
576 /// Allow subclasses to override and print debug traces before/after vplan 577 /// execution, when trace information is requested. 581 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the 582 /// vector preheader and its predecessor, also connecting the new block to the 583 /// scalar preheader. 586 /// The original loop. 589 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 590 /// dynamic knowledge to simplify SCEV expressions and converts them to a 591 /// more usable form. 600 /// Target Library Info. 603 /// Target Transform Info. 606 /// Assumption Cache. 609 /// Interface to emit optimization remarks. 612 /// The vectorization SIMD factor to use. Each vector will have this many 618 /// The vectorization unroll factor to use. Each scalar is vectorized to this 619 /// many different vector instructions. 622 /// The builder that we use 625// --- Vectorization state --- 627 /// The vector-loop preheader. 630 /// The scalar-loop preheader. 633 /// Middle Block between the vector and the scalar. 636 /// A list of all bypass blocks. The first block is the entry of the loop. 639 /// Store instructions that were predicated. 642 /// Trip count of the original loop. 645 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 648 /// The legality analysis. 651 /// The profitablity analysis. 654// Record whether runtime checks are added. 657 /// BFI and PSI are used to check for profile guided size optimizations. 661// Whether this loop should be optimized for size based on profile guided size 665 /// Structure to hold information about generated runtime checks, responsible 666 /// for cleaning the checks, if vectorization turns out unprofitable. 669 /// Mapping of induction phis to their additional bypass values. They 670 /// need to be added as operands to phi nodes in the scalar loop preheader 671 /// after the epilogue skeleton has been created. 674 /// The additional bypass block which conditionally skips over the epilogue 675 /// loop after executing the main loop. Needed to resume inductions and 676 /// reductions during epilogue vectorization. 681 /// The vector preheader block of \p Plan, used as target for check blocks 682 /// introduced during skeleton creation. 686/// Encapsulate information regarding vectorization of a loop and its epilogue. 687/// This information is meant to be updated and used across two stages of 688/// epilogue vectorization. 708"A high UF for the epilogue loop is likely not beneficial.");
712/// An extension of the inner loop vectorizer that creates a skeleton for a 713/// vectorized loop that has its epilogue (residual) also vectorized. 714/// The idea is to run the vplan on a given loop twice, firstly to setup the 715/// skeleton and vectorize the main loop, and secondly to complete the skeleton 716/// from the first step and vectorize the epilogue. This is achieved by 717/// deriving two concrete strategy classes from this base class and invoking 718/// them in succession from the loop vectorizer planner. 730EPI.MainLoopVF,
EPI.MainLoopVF,
EPI.MainLoopUF, LVL,
734// Override this function to handle the more complex control flow around the 741 /// The interface for creating a vectorized skeleton using one of two 742 /// different strategies, each corresponding to one execution of the vplan 743 /// as described above. 747 /// Holds and updates state information required to vectorize the main loop 748 /// and its epilogue in two separate passes. This setup helps us avoid 749 /// regenerating and recomputing runtime safety checks. It also helps us to 750 /// shorten the iteration-count-check path length for the cases where the 751 /// iteration count of the loop is so small that the main vector loop is 752 /// completely skipped. 756/// A specialized derived class of inner loop vectorizer that performs 757/// vectorization of *main* loops in the process of vectorizing loops and their 771 /// Implements the interface for creating a vectorized skeleton using the 772 /// *main loop* strategy (ie the first pass of vplan execution). 777 /// Emits an iteration count bypass check once for the main loop (when \p 778 /// ForEpilogue is false) and once for the epilogue loop (when \p 779 /// ForEpilogue is true). 785// A specialized derived class of inner loop vectorizer that performs 786// vectorization of *epilogue* loops in the process of vectorizing loops and 802 /// Implements the interface for creating a vectorized skeleton using the 803 /// *epilogue loop* strategy (ie the second pass of vplan execution). 808 /// Emits an iteration count bypass check after the main vector loop has 809 /// finished to see if there are any iterations left to execute by either 810 /// the vector epilogue or the scalar epilogue. 817}
// end namespace llvm 819/// Look for a meaningful debug location on the instruction or its operands. 825if (
I->getDebugLoc() != Empty)
826returnI->getDebugLoc();
828for (
Use &
Op :
I->operands()) {
830if (OpInst->getDebugLoc() != Empty)
831return OpInst->getDebugLoc();
834returnI->getDebugLoc();
837/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 838/// is passed, the message relates to that particular instruction. 843dbgs() <<
"LV: " << Prefix << DebugMsg;
852/// Create an analysis remark that explains why vectorization failed 854/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 855/// RemarkName is the identifier for the remark. If \p I is passed it is an 856/// instruction that prevents vectorization. Otherwise \p TheLoop is used for 857/// the location of the remark. If \p DL is passed, use it as debug location for 858/// the remark. \return the remark object that can be streamed to. 863// If debug location is attached to the instruction, use it. Otherwise if DL 864// was not provided, use the loop's. 865if (
I &&
I->getDebugLoc())
875/// Return a value for Step multiplied by VF. 882/// Return the runtime value for VF. 884returnB.CreateElementCount(Ty, VF);
895 <<
"loop not vectorized: " << OREMsg);
898/// Reports an informative message: print \p Msg for debugging purposes as well 899/// as an optimization remark. Uses either \p I as location of the remark, or 900/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the 901/// remark. If \p DL is passed, use it as debug location for the remark. 913/// Report successful vectorization of the loop. In case an outer loop is 914/// vectorized, prepend "outer" to the vectorization remark. 918"Vectorizing: ", TheLoop->
isInnermost() ?
"innermost loop" :
"outer loop",
924 <<
"vectorized " << LoopType <<
"loop (vectorization width: " 926 <<
", interleaved count: " <<
ore::NV(
"InterleaveCount", IC) <<
")";
930}
// end namespace llvm 934// Loop vectorization cost-model hints how the scalar epilogue loop should be 938// The default: allowing scalar epilogues. 941// Vectorization with OptForSize: don't allow epilogues. 944// A special case of vectorisation with OptForSize: loops with a very small 945// trip count are considered for vectorization under OptForSize, thereby 946// making sure the cost of their loop body is dominant, free of runtime 947// guards and scalar iteration overheads. 950// Loop hint predicate indicating an epilogue is undesired. 953// Directive indicating we must either tail fold or not vectorize 959/// LoopVectorizationCostModel - estimates the expected speedups due to 961/// In many cases vectorization is not profitable. This can happen because of 962/// a number of reasons. In this class we mainly attempt to predict the 963/// expected speedup/slowdowns due to the supported instruction set. We use the 964/// TargetTransformInfo to query the different backends for the cost of 965/// different operations. 983 /// \return An upper bound for the vectorization factors (both fixed and 984 /// scalable). If the factors are 0, vectorization and interleaving should be 985 /// avoided up front. 988 /// \return True if runtime checks are required for vectorization, and false 992 /// Setup cost-based decisions for user vectorization factor. 993 /// \return true if the UserVF is a feasible VF to be chosen. 1000 /// \return The size (in bits) of the smallest and widest types in the code 1001 /// that needs to be vectorized. We ignore values that remain scalar such as 1002 /// 64 bit loop indices. 1005 /// \return The desired interleave count. 1006 /// If interleave count has been specified by metadata it will be returned. 1007 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1008 /// are the selected vectorization factor and the cost of the selected VF. 1011 /// Memory access instruction may be vectorized in more than one way. 1012 /// Form of instruction after vectorization depends on cost. 1013 /// This function takes cost-based decisions for Load/Store instructions 1014 /// and collects them in a map. This decisions map is used for building 1015 /// the lists of loop-uniform and loop-scalar instructions. 1016 /// The calculated cost is saved with widening decision in order to 1017 /// avoid redundant calculations. 1020 /// A call may be vectorized in different ways depending on whether we have 1021 /// vectorized variants available and whether the target supports masking. 1022 /// This function analyzes all calls in the function at the supplied VF, 1023 /// makes a decision based on the costs of available options, and stores that 1024 /// decision in a map for use in planning and plan execution. 1027 /// A struct that represents some properties of the register usage 1030 /// Holds the number of loop invariant values that are used in the loop. 1031 /// The key is ClassID of target-provided register class. 1033 /// Holds the maximum number of concurrent live intervals in the loop. 1034 /// The key is ClassID of target-provided register class. 1038 /// \return Returns information about the register usages of the loop for the 1039 /// given vectorization factors. 1043 /// Collect values we want to ignore in the cost model. 1046 /// Collect all element types in the loop for which widening is needed. 1049 /// Split reductions into those that happen in the loop, and those that happen 1050 /// outside. In loop reductions are collected into InLoopReductions. 
1053 /// Returns true if we should use strict in-order reductions for the given 1054 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1055 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1056 /// of FP operations. 1061 /// \returns The smallest bitwidth each instruction can be represented with. 1062 /// The vector equivalents of these instructions should be truncated to this 1068 /// \returns True if it is more profitable to scalarize instruction \p I for 1069 /// vectorization factor \p VF. 1072"Profitable to scalarize relevant only for VF > 1.");
1075"cost-model should not be used for outer loops (in VPlan-native path)");
1077auto Scalars = InstsToScalarize.find(VF);
1078assert(Scalars != InstsToScalarize.end() &&
1079"VF not yet analyzed for scalarization profitability");
1080return Scalars->second.contains(
I);
1083 /// Returns true if \p I is known to be uniform after vectorization. 1087"cost-model should not be used for outer loops (in VPlan-native path)");
1088// Pseudo probe needs to be duplicated for each unrolled iteration and 1089// vector lane so that profiled loop trip count can be accurately 1090// accumulated instead of being under counted. 1091if (isa<PseudoProbeInst>(
I))
1097auto UniformsPerVF = Uniforms.find(VF);
1098assert(UniformsPerVF != Uniforms.end() &&
1099"VF not yet analyzed for uniformity");
1100return UniformsPerVF->second.count(
I);
1103 /// Returns true if \p I is known to be scalar after vectorization. 1107"cost-model should not be used for outer loops (in VPlan-native path)");
1111auto ScalarsPerVF = Scalars.find(VF);
1112assert(ScalarsPerVF != Scalars.end() &&
1113"Scalar values are not calculated for VF");
1114return ScalarsPerVF->second.count(
I);
1117 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1118 /// for vectorization factor \p VF. 1120return VF.
isVector() && MinBWs.contains(
I) &&
1125 /// Decision that was taken during cost calculation for memory instruction. 1137 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1138 /// instruction \p I and vector width \p VF. 1142 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1145 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1146 /// interleaving group \p Grp and vector width \p VF. 1151 /// Broadcast this decicion to all instructions inside the group. 1152 /// When interleaving, the cost will only be assigned one instruction, the 1153 /// insert position. For other cases, add the appropriate fraction of the 1154 /// total cost to each instruction. This ensures accurate costs are used, 1155 /// even if the insert position instruction is not used. 1164 WideningDecisions[std::make_pair(
I, VF)] =
1165 std::make_pair(W, InsertPosCost);
1167 WideningDecisions[std::make_pair(
I, VF)] =
1168 std::make_pair(W, OtherMemberCost);
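  // Worked example (illustrative): for an interleave group of 4 members with a
  // total cost of 12, an interleave decision records the whole 12 on the
  // insert-position member (InsertPosCost) and 0 on the remaining members,
  // while a non-interleave decision records 12 / 4 = 3 on every member
  // (OtherMemberCost), so per-instruction costs stay meaningful even when the
  // insert position itself has no uses.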
1173 /// Return the cost model decision for the given instruction \p I and vector 1174 /// width \p VF. Return CM_Unknown if this instruction did not pass 1175 /// through the cost modeling. 1180"cost-model should not be used for outer loops (in VPlan-native path)");
1182 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1183auto Itr = WideningDecisions.
find(InstOnVF);
1184if (Itr == WideningDecisions.
end())
1186return Itr->second.first;
1189 /// Return the vectorization cost for the given instruction \p I and vector 1193 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1195"The cost is not calculated");
1196return WideningDecisions[InstOnVF].second;
1209 std::optional<unsigned> MaskPos,
1212 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1219return CallWideningDecisions.
at(std::make_pair(CI, VF));
1222 /// Return True if instruction \p I is an optimizable truncate whose operand 1223 /// is an induction variable. Such a truncate will be removed by adding a new 1224 /// induction variable with the destination type. 1226// If the instruction is not a truncate, return false. 1227auto *Trunc = dyn_cast<TruncInst>(
I);
1231// Get the source and destination types of the truncate. 1235// If the truncate is free for the given types, return false. Replacing a 1236// free truncate with an induction variable would add an induction variable 1237// update instruction to each iteration of the loop. We exclude from this 1238// check the primary induction variable since it will need an update 1239// instruction regardless. 1240Value *
Op = Trunc->getOperand(0);
1244// If the truncated value is not an induction variable, return false. 1248 /// Collects the instructions to scalarize for each predicated instruction in 1252 /// Collect Uniform and Scalar values for the given \p VF. 1253 /// The sets depend on CM decision for Load/Store instructions 1254 /// that may be vectorized as interleave, gather-scatter or scalarized. 1255 /// Also make a decision on what to do about call instructions in the loop 1256 /// at that VF -- scalarize, call a known vector routine, or call a 1257 /// vector intrinsic. 1259// Do the analysis once. 1260if (VF.
isScalar() || Uniforms.contains(VF))
1263 collectLoopUniforms(VF);
1265 collectLoopScalars(VF);
1268 /// Returns true if the target machine supports masked store operation 1269 /// for the given \p DataType and kind of access to \p Ptr. 1275 /// Returns true if the target machine supports masked load operation 1276 /// for the given \p DataType and kind of access to \p Ptr. 1282 /// Returns true if the target machine can represent \p V as a masked gather 1283 /// or scatter operation. 1285boolLI = isa<LoadInst>(V);
1286bool SI = isa<StoreInst>(V);
1297 /// Returns true if the target machine supports all of the reduction 1298 /// variables found for the given VF. 1301 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1306 /// Given costs for both strategies, return true if the scalar predication 1307 /// lowering should be used for div/rem. This incorporates an override 1308 /// option so it is not simply a cost comparison. 1313return ScalarCost < SafeDivisorCost;
1322 /// Returns true if \p I is an instruction which requires predication and 1323 /// for which our chosen predication strategy is scalarization (i.e. we 1324 /// don't have an alternate strategy such as masking available). 1325 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1328 /// Returns true if \p I is an instruction that needs to be predicated 1329 /// at runtime. The result is independent of the predication mechanism. 1330 /// Superset of instructions that return true for isScalarWithPredication. 1333 /// Return the costs for our two available strategies for lowering a 1334 /// div/rem operation which requires speculating at least one lane. 1335 /// First result is for scalarization (will be invalid for scalable 1336 /// vectors); second is for the safe-divisor strategy. 1337 std::pair<InstructionCost, InstructionCost>
1341 /// Returns true if \p I is a memory instruction with consecutive memory 1342 /// access that can be widened. 1345 /// Returns true if \p I is a memory instruction in an interleaved-group 1346 /// of memory accesses that can be vectorized with wide vector loads/stores 1350 /// Check if \p Instr belongs to any interleaved access group. 1355 /// Get the interleaved access group that \p Instr belongs to. 1361 /// Returns true if we're required to use a scalar epilogue for at least 1362 /// the final iteration of the original loop. 1365LLVM_DEBUG(
dbgs() <<
"LV: Loop does not require scalar epilogue\n");
1368// If we might exit from anywhere but the latch and early exit vectorization 1369// is disabled, we must run the exiting iteration in scalar form. 1372LLVM_DEBUG(
dbgs() <<
"LV: Loop requires scalar epilogue: not exiting " 1373"from latch block\n");
1378"interleaved group requires scalar epilogue\n");
1381LLVM_DEBUG(
dbgs() <<
"LV: Loop does not require scalar epilogue\n");
1385 /// Returns true if we're required to use a scalar epilogue for at least 1386 /// the final iteration of the original loop for all VFs in \p Range. 1387 /// A scalar epilogue must either be required for all VFs in \p Range or for 1393bool IsRequired =
all_of(
Range, RequiresScalarEpilogue);
1395 (IsRequired ||
none_of(
Range, RequiresScalarEpilogue)) &&
1396"all VFs in range must agree on whether a scalar epilogue is required");
1400 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1401 /// loop hint annotation. 1406 /// Returns the TailFoldingStyle that is best for the current loop. 1408if (!ChosenTailFoldingStyle)
1410return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1411 : ChosenTailFoldingStyle->second;
1414 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1415 /// overflow or not. 1416 /// \param IsScalableVF true if scalable vector factors enabled. 1417 /// \param UserIC User specific interleave count. 1419assert(!ChosenTailFoldingStyle &&
"Tail folding must not be selected yet.");
1421 ChosenTailFoldingStyle =
1427 ChosenTailFoldingStyle = std::make_pair(
1433// Set styles when forced. 1438// Override forced styles if needed. 1439// FIXME: use actual opcode/data type for analysis here. 1440// FIXME: Investigate opportunity for fixed vector factor. 1441// FIXME: support fixed-order recurrences by fixing splice of non VFxUF 1447// If for some reason EVL mode is unsupported, fallback to 1448// DataWithoutLaneMask to try to vectorize the loop with folded tail 1450 ChosenTailFoldingStyle =
1455 <<
"LV: Preference for VP intrinsics indicated. Will " 1456"not try to generate VP Intrinsics " 1458 ?
"since interleave count specified is greater than 1.\n" 1459 :
"due to non-interleaving reasons.\n"));
1463 /// Returns true if all loop blocks should be masked to fold tail loop. 1465// TODO: check if it is possible to check for None style independent of 1466// IVUpdateMayOverflow flag in getTailFoldingStyle. 1470 /// Return maximum safe number of elements to be processed per vector 1471 /// iteration, which do not prevent store-load forwarding and are safe with 1472 /// regard to the memory dependencies. Required for EVL-based VPlans to 1473 /// correctly calculate AVL (application vector length) as min(remaining AVL, 1474 /// MaxSafeElements). 1475 /// TODO: need to consider adjusting cost model to use this value as a 1476 /// vectorization factor for EVL-based vectorization. 1479 /// Returns true if the instructions in this block requires predication 1480 /// for any reason, e.g. because tail folding now requires a predicate 1481 /// or because the block in the original loop was predicated. 1486 /// Returns true if VP intrinsics with explicit vector length support should 1487 /// be generated in the tail folded loop. 1492 /// Returns true if the Phi is part of an inloop reduction. 1494return InLoopReductions.contains(Phi);
1497 /// Returns true if the predicated reduction select should be used to set the 1498 /// incoming value for the reduction phi. 1500// Force to use predicated reduction select since the EVL of the 1501// second-to-last iteration might not be VF*UF. 1509 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1510 /// with factor VF. Return the cost of the instruction, including 1511 /// scalarization overhead if it's needed. 1514 /// Estimate cost of a call instruction CI if it were vectorized with factor 1515 /// VF. Return the cost of the instruction, including scalarization overhead 1519 /// Invalidates decisions already taken by the cost model. 1521 WideningDecisions.
clear();
1522 CallWideningDecisions.
clear();
1527 /// Returns the expected execution cost. The unit of the cost does 1528 /// not matter because we use the 'cost' units to compare different 1529 /// vector widths. The cost that is returned is *not* normalized by 1530 /// the factor width. 1535 /// Returns true if epilogue vectorization is considered profitable, and 1536 /// false otherwise. 1537 /// \p VF is the vectorization factor chosen for the original loop. 1538 /// \p Multiplier is an aditional scaling factor applied to VF before 1539 /// comparing to EpilogueVectorizationMinVF. 1541constunsigned IC)
const;
1543 /// Returns the execution time cost of an instruction for a given vector 1544 /// width. Vector width of one means scalar. 1547 /// Return the cost of instructions in an inloop reduction pattern, if I is 1548 /// part of that pattern. 1551Type *VectorTy)
const;
1553 /// Returns true if \p Op should be considered invariant and if it is 1554 /// trivially hoistable. 1558unsigned NumPredStores = 0;
1560 /// \return An upper bound for the vectorization factors for both 1561 /// fixed and scalable vectorization, where the minimum-known number of 1562 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1563 /// disabled or unsupported, then the scalable part will be equal to 1564 /// ElementCount::getScalable(0). 1567bool FoldTailByMasking);
1569 /// \return the maximized element count based on the targets vector 1570 /// registers and the loop trip-count, but limited to a maximum safe VF. 1571 /// This is a helper function of computeFeasibleMaxVF. 1572ElementCount getMaximizedVFForTarget(
unsigned MaxTripCount,
1573unsigned SmallestType,
1576bool FoldTailByMasking);
1578 /// Checks if scalable vectorization is supported and enabled. Caches the 1579 /// result to avoid repeated debug dumps for repeated queries. 1580bool isScalableVectorizationAllowed();
1582 /// \return the maximum legal scalable VF, based on the safe max number 1584ElementCount getMaxLegalScalableVF(
unsigned MaxSafeElements);
1586 /// Calculate vectorization cost of memory instruction \p I. 1589 /// The cost computation for scalarized memory instruction. 1592 /// The cost computation for interleaving group of memory instructions. 1595 /// The cost computation for Gather/Scatter instruction. 1598 /// The cost computation for widening instruction \p I with consecutive 1602 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1603 /// Load: scalar load + broadcast. 1604 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1608 /// Estimate the overhead of scalarizing an instruction. This is a 1609 /// convenience wrapper for the type-based getScalarizationOverhead API. 1613 /// Returns true if an artificially high cost for emulated masked memrefs 1617 /// Map of scalar integer values to the smallest bitwidth they can be legally 1618 /// represented as. The vector equivalents of these values should be truncated 1622 /// A type representing the costs for instructions if they were to be 1623 /// scalarized rather than vectorized. The entries are Instruction-Cost 1627 /// A set containing all BasicBlocks that are known to present after 1628 /// vectorization as a predicated block. 1630 PredicatedBBsAfterVectorization;
1632 /// Records whether it is allowed to have the original scalar loop execute at 1633 /// least once. This may be needed as a fallback loop in case runtime 1634 /// aliasing/dependence checks fail, or to handle the tail/remainder 1635 /// iterations when the trip count is unknown or doesn't divide by the VF, 1636 /// or as a peel-loop to handle gaps in interleave-groups. 1637 /// Under optsize and when the trip count is very small we don't allow any 1638 /// iterations to execute in the scalar loop. 1641 /// Control finally chosen tail folding style. The first element is used if 1642 /// the IV update may overflow, the second element - if it does not. 1643 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644 ChosenTailFoldingStyle;
1646 /// true if scalable vectorization is supported and enabled. 1647 std::optional<bool> IsScalableVectorizationAllowed;
1649 /// Maximum safe number of elements to be processed per vector iteration, 1650 /// which do not prevent store-load forwarding and are safe with regard to the 1651 /// memory dependencies. Required for EVL-based veectorization, where this 1652 /// value is used as the upper bound of the safe AVL. 1653 std::optional<unsigned> MaxSafeElements;
1655 /// A map holding scalar costs for different vectorization factors. The 1656 /// presence of a cost for an instruction in the mapping indicates that the 1657 /// instruction will be scalarized when vectorizing with the associated 1658 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1661 /// Holds the instructions known to be uniform after vectorization. 1662 /// The data is collected per VF. 1665 /// Holds the instructions known to be scalar after vectorization. 1666 /// The data is collected per VF. 1669 /// Holds the instructions (address computations) that are forced to be 1673 /// PHINodes of the reductions that should be expanded in-loop. 1676 /// A Map of inloop reduction operations and their immediate chain operand. 1677 /// FIXME: This can be removed once reductions can be costed correctly in 1678 /// VPlan. This was added to allow quick lookup of the inloop operations. 1681 /// Returns the expected difference in cost from scalarizing the expression 1682 /// feeding a predicated instruction \p PredInst. The instructions to 1683 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1684 /// non-negative return value implies the expression will be scalarized. 1685 /// Currently, only single-use chains are considered for scalarization. 1687 ScalarCostsTy &ScalarCosts,
1690 /// Collect the instructions that are uniform after vectorization. An 1691 /// instruction is uniform if we represent it with a single scalar value in 1692 /// the vectorized loop corresponding to each vector iteration. Examples of 1693 /// uniform instructions include pointer operands of consecutive or 1694 /// interleaved memory accesses. Note that although uniformity implies an 1695 /// instruction will be scalar, the reverse is not true. In general, a 1696 /// scalarized instruction will be represented by VF scalar values in the 1697 /// vectorized loop, each corresponding to an iteration of the original 1701 /// Collect the instructions that are scalar after vectorization. An 1702 /// instruction is scalar if it is known to be uniform or will be scalarized 1703 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1704 /// to the list if they are used by a load/store instruction that is marked as 1705 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1706 /// VF values in the vectorized loop, each corresponding to an iteration of 1707 /// the original scalar loop. 1710 /// Keeps cost model vectorization decision and cost for instructions. 1711 /// Right now it is used for memory instructions only. 1713 std::pair<InstWidening, InstructionCost>>;
1715 DecisionList WideningDecisions;
1717usingCallDecisionList =
1720 CallDecisionList CallWideningDecisions;
1722 /// Returns true if \p V is expected to be vectorized and it needs to be 1731// Assume we can vectorize V (and hence we need extraction) if the 1732// scalars are not computed yet. This can happen, because it is called 1733// via getScalarizationOverhead from setCostBasedWideningDecision, before 1734// the scalars are collected. That should be a safe assumption in most 1735// cases, because we check if the operands have vectorizable types 1736// beforehand in LoopVectorizationLegality. 1740 /// Returns a range containing only operands needing to be extracted. 1744 Ops, [
this, VF](
Value *V) {
return this->needsExtract(V, VF); }));
1748 /// The loop that we evaluate. 1751 /// Predicated scalar evolution analysis. 1754 /// Loop Info analysis. 1757 /// Vectorization legality. 1760 /// Vector target information. 1763 /// Target Library Info. 1766 /// Demanded bits analysis. 1769 /// Assumption cache. 1772 /// Interface to emit optimization remarks. 1777 /// Loop Vectorize Hint. 1780 /// The interleave access information contains groups of interleaved accesses 1781 /// with the same stride and close to each other. 1784 /// Values to ignore in the cost model. 1787 /// Values to ignore in the cost model when VF > 1. 1790 /// All element types found in the loop. 1793 /// The kind of cost that we are calculating 1796}
// end namespace llvm 1799/// Helper struct to manage generating runtime checks for vectorization. 1801/// The runtime checks are created up-front in temporary blocks to allow better 1802/// estimating the cost and un-linked from the existing IR. After deciding to 1803/// vectorize, the checks are moved back. If deciding not to vectorize, the 1804/// temporary blocks are completely removed. 1805classGeneratedRTChecks {
1806 /// Basic block which contains the generated SCEV checks, if any. 1809 /// The value representing the result of the generated SCEV checks. If it is 1810 /// nullptr, either no SCEV checks have been generated or they have been used. 1811Value *SCEVCheckCond =
nullptr;
1813 /// Basic block which contains the generated memory runtime checks, if any. 1816 /// The value representing the result of the generated memory runtime checks. 1817 /// If it is nullptr, either no memory runtime checks have been generated or 1818 /// they have been used. 1819Value *MemRuntimeCheckCond =
nullptr;
1828bool CostTooHigh =
false;
1829constbool AddBranchWeights;
1831Loop *OuterLoop =
nullptr;
1835 /// The kind of cost that we are calculating 1843 : DT(DT), LI(LI),
TTI(
TTI), SCEVExp(*PSE.
getSE(),
DL,
"scev.check"),
1844 MemCheckExp(*PSE.
getSE(),
DL,
"scev.check"),
1845 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1848 /// accurately estimate the cost of the runtime checks. The blocks are 1849 /// un-linked from the IR and are added back during vector code generation. If 1850 /// there is no vector code generation, the check blocks are removed 1855// Hard cutoff to limit compile-time increase in case a very large number of 1856// runtime checks needs to be generated. 1857// TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1867// Use SplitBlock to create blocks for SCEV & memory runtime checks to 1868// ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1869// may be used by SCEVExpander. The blocks will be un-linked from their 1870// predecessors and removed from LI & DT at the end of the function. 1873nullptr,
"vector.scevcheck");
1880if (RtPtrChecking.Need) {
1881auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882 MemCheckBlock =
SplitBlock(Pred, Pred->getTerminator(), DT, LI,
nullptr,
1885auto DiffChecks = RtPtrChecking.getDiffChecks();
1887Value *RuntimeVF =
nullptr;
1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1898 MemCheckBlock->
getTerminator(), L, RtPtrChecking.getChecks(),
1901assert(MemRuntimeCheckCond &&
1902"no RT checks generated although RtPtrChecking " 1903"claimed checks are required");
1906if (!MemCheckBlock && !SCEVCheckBlock)
1909// Unhook the temporary block with the checks, update various places 1916if (SCEVCheckBlock) {
1934if (SCEVCheckBlock) {
1939// Outer loop is used as part of the later cost calculations. 1940 OuterLoop =
L->getParentLoop();
1944if (SCEVCheckBlock || MemCheckBlock)
1957if (SCEVCheckBlock->getTerminator() == &
I)
1966if (MemCheckBlock->getTerminator() == &
I)
1973// If the runtime memory checks are being created inside an outer loop 1974// we should find out if these checks are outer loop invariant. If so, 1975// the checks will likely be hoisted out and so the effective cost will 1976// reduce according to the outer loop trip count. 1979// TODO: If profitable, we could refine this further by analysing every 1980// individual memory check, since there could be a mixture of loop 1981// variant and invariant checks that mean the final condition is 1985// It seems reasonable to assume that we can reduce the effective 1986// cost of the checks even when we know nothing about the trip 1987// count. Assume that the outer loop executes at least twice. 1988unsigned BestTripCount = 2;
1990// Get the best known TC estimate. 1992 PSE, OuterLoop,
/* CanUseConstantMax = */false))
1993 BestTripCount = *EstimatedTC;
1995 BestTripCount = std::max(BestTripCount, 1U);
1998// Let's ensure the cost is always at least 1. 1999 NewMemCheckCost = std::max(*NewMemCheckCost.
getValue(),
2002if (BestTripCount > 1)
2004 <<
"We expect runtime memory checks to be hoisted " 2005 <<
"out of the outer loop. Cost reduced from " 2006 << MemCheckCost <<
" to " << NewMemCheckCost <<
'\n');
2008 MemCheckCost = NewMemCheckCost;
2012 RTCheckCost += MemCheckCost;
2015if (SCEVCheckBlock || MemCheckBlock)
2016LLVM_DEBUG(
dbgs() <<
"Total cost of runtime checks: " << RTCheckCost
2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2024 ~GeneratedRTChecks() {
2028 SCEVCleaner.markResultUsed();
2030if (!MemRuntimeCheckCond)
2031 MemCheckCleaner.markResultUsed();
2033if (MemRuntimeCheckCond) {
2034auto &SE = *MemCheckExp.
getSE();
2035// Memory runtime check generation creates compares that use expanded 2036// values. Remove them before running the SCEVExpanderCleaners. 2044 MemCheckCleaner.cleanup();
2045 SCEVCleaner.cleanup();
2048 SCEVCheckBlock->eraseFromParent();
2049if (MemRuntimeCheckCond)
2050 MemCheckBlock->eraseFromParent();
2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2054 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2055 /// depending on the generated condition. 2062// Mark the check as used, to prevent it from being removed during cleanup. 2063 SCEVCheckCond =
nullptr;
2064if (
auto *
C = dyn_cast<ConstantInt>(
Cond))
2071// Create new preheader for vector loop. 2075 SCEVCheckBlock->getTerminator()->eraseFromParent();
2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2084if (AddBranchWeights)
2087return SCEVCheckBlock;
2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2091 /// the branches to branch to the vector preheader or \p Bypass, depending on 2092 /// the generated condition. 2095// Check if we generated code that checks in runtime if arrays overlap. 2096if (!MemRuntimeCheckCond)
2105 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2112if (AddBranchWeights) {
2116 MemCheckBlock->getTerminator()->setDebugLoc(
2117 Pred->getTerminator()->getDebugLoc());
2119// Mark the check as used, to prevent it from being removed during cleanup. 2120 MemRuntimeCheckCond =
nullptr;
2121return MemCheckBlock;
2127return Style == TailFoldingStyle::Data ||
2128 Style == TailFoldingStyle::DataAndControlFlow ||
2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2133return Style == TailFoldingStyle::DataAndControlFlow ||
2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2137// Return true if \p OuterLp is an outer loop annotated with hints for explicit 2138// vectorization. The loop needs to be annotated with #pragma omp simd 2139// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2140// vector length information is not provided, vectorization is not considered 2141// explicit. Interleave hints are not allowed either. These limitations will be 2142// relaxed in the future. 2143// Please, note that we are currently forced to abuse the pragma 'clang 2144// vectorize' semantics. This pragma provides *auto-vectorization hints* 2145// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2146// provides *explicit vectorization hints* (LV can bypass legal checks and 2147// assume that vectorization is legal). However, both hints are implemented 2148// using the same metadata (llvm.loop.vectorize, processed by 2149// LoopVectorizeHints). This will be fixed in the future when the native IR 2150// representation for pragma 'omp simd' is introduced. 2156// Only outer loops with an explicit vectorization hint are supported. 2157// Unannotated outer loops are ignored. 2163true/*VectorizeOnlyWhenForced*/)) {
2164LLVM_DEBUG(
dbgs() <<
"LV: Loop hints prevent outer loop vectorization.\n");
2169// TODO: Interleave support is future work. 2170LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Interleave is not supported for " 2182// Collect inner loops and outer loops without irreducible control flow. For 2183// now, only collect outer loops that have explicit vectorization hints. If we 2184// are stress testing the VPlan H-CFG construction, we collect the outermost 2185// loop of every loop nest. 2190if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192// TODO: Collect inner loops inside marked outer loops in case 2193// vectorization fails for the outer loop. Do not invoke 2194// 'containsIrreducibleCFG' again for inner loops when the outer loop is 2195// already known to be reducible. We can use an inherited attribute for 2200for (
Loop *InnerL : L)
2204//===----------------------------------------------------------------------===// 2205// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2206// LoopVectorizationCostModel and LoopVectorizationPlanner. 2207//===----------------------------------------------------------------------===// 2209/// Compute the transformed value of Index at offset StartValue using step 2211/// For integer induction, returns StartValue + Index * StepValue. 2212/// For pointer induction, returns StartValue[Index * StepValue]. 2213/// FIXME: The newly created binary instructions should contain nsw/nuw 2214/// flags, which can be found from the original scalar operations. 2222 ?
B.CreateSExtOrTrunc(Index, StepTy)
2223 :
B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224if (CastedIndex != Index) {
2226 Index = CastedIndex;
2229// Note: the IR at this point is broken. We cannot use SE to create any new 2230// SCEV and then expand it, hoping that SCEV's simplification will give us 2231// a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2232// lead to various SCEV crashes. So all we can do is to use builder and rely 2233// on InstCombine for future simplifications. Here we handle some trivial 2236assert(
X->getType() ==
Y->getType() &&
"Types don't match!");
2237if (
auto *CX = dyn_cast<ConstantInt>(
X))
2240if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2243returnB.CreateAdd(
X,
Y);
2246// We allow X to be a vector type, in which case Y will potentially be 2247// splatted into a vector with the same element count. 2249assert(
X->getType()->getScalarType() ==
Y->getType() &&
2250"Types don't match!");
2251if (
auto *CX = dyn_cast<ConstantInt>(
X))
2254if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2257VectorType *XVTy = dyn_cast<VectorType>(
X->getType());
2258if (XVTy && !isa<VectorType>(
Y->getType()))
2259Y =
B.CreateVectorSplat(XVTy->getElementCount(),
Y);
2260returnB.CreateMul(
X,
Y);
2263switch (InductionKind) {
2265assert(!isa<VectorType>(Index->getType()) &&
2266"Vector indices not supported for integer inductions yet");
2268"Index type does not match StartValue type");
2269if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270returnB.CreateSub(StartValue, Index);
2275returnB.CreatePtrAdd(StartValue,
CreateMul(Index, Step));
2277assert(!isa<VectorType>(Index->getType()) &&
2278"Vector indices not supported for FP inductions yet");
2281 (InductionBinOp->
getOpcode() == Instruction::FAdd ||
2282 InductionBinOp->
getOpcode() == Instruction::FSub) &&
2283"Original bin op should be defined for FP induction");
2285Value *MulExp =
B.CreateFMul(Step, Index);
2286returnB.CreateBinOp(InductionBinOp->
getOpcode(), StartValue, MulExp,
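// Worked example (illustrative) for the transform above: an integer induction
// with StartValue 4 and Step 3 maps index i to 4 + 3 * i; a pointer induction
// maps it to &StartValue[3 * i]; an FP induction built from FAdd maps it to
// StartValue + 3.0 * i.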
2300if (
F.hasFnAttribute(Attribute::VScaleRange))
2301returnF.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
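// For example (illustrative): a function carrying the attribute
// vscale_range(1,16) reports a maximum vscale of 16 here, so a scalable VF of
// <vscale x 4 x i32> is tuned as if up to 16 * 4 = 64 lanes could execute per
// wide iteration.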
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      std::optional<unsigned> MaxVScale =
          getMaxVScale(*Cost->TheFunction, Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }
    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
  }
  return false;
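// A minimal sketch of the overflow reasoning above in plain unsigned
// arithmetic, assuming a fixed (non-scalable) VF and an induction type of
// IdxBits bits; names are hypothetical. The runtime check folds to false
// exactly when the headroom above TC is larger than one vector step.
#include <cstdint>
#include <limits>

static bool indvarOverflowCheckKnownFalse(uint64_t TC, uint64_t VF, uint64_t UF,
                                          unsigned IdxBits) {
  const uint64_t MaxUIntTripCount =
      IdxBits == 64 ? std::numeric_limits<uint64_t>::max()
                    : (uint64_t(1) << IdxBits) - 1;
  // TC + VF * UF cannot wrap iff MaxUIntTripCount - TC > VF * UF.
  return (MaxUIntTripCount - TC) > VF * UF;
}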
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
  // If an override option has been passed in for interleaved accesses, use it.

  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
    Cloned->setName(Instr->getName() + ".cloned");
2362// Verify that VPlan type inference results agree with the type of the 2365"inferred type and type from generated instructions do not match");
2371if (
autoDL = Instr->getDebugLoc())
2374// Replace the operands of the cloned instructions with their scalar 2375// equivalents in the new loop. 2377auto InputLane = Lane;
2385// Place the cloned scalar in the new loop. 2388 State.
set(RepRecipe, Cloned, Lane);
2390// If we just cloned a new assumption, add it the assumption cache. 2391if (
auto *
II = dyn_cast<AssumeInst>(Cloned))
2396bool IfPredicateInstr = Parent ? Parent->
isReplicator() :
false;
2400 [](
VPValue *
Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401"Expected a recipe is either within a region or all of its operands " 2402"are defined outside the vectorized region.");
2403if (IfPredicateInstr)
  // This is where we can make the step a runtime constant.

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
         "Unexpected successor");
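// A plain-C++ sketch of the vector trip-count computation described in the
// comments above. It assumes Step = VF * UF, and that tail folding and a
// required scalar epilogue do not occur together; all names are hypothetical.
#include <cassert>
#include <cstdint>

static uint64_t vectorTripCountSketch(uint64_t N, uint64_t Step, bool FoldTail,
                                      bool RequiresScalarEpilogue) {
  // When folding the tail, round N up to a multiple of Step by adding Step-1
  // and then rounding down (Step is a power of two in that case).
  if (FoldTail) {
    assert((Step & (Step - 1)) == 0 && "Step must be a power of 2");
    N += Step - 1;
  }
  uint64_t VecTripCount = N - (N % Step);
  // If a scalar epilogue must run at least once and Step evenly divides N,
  // hand one full vector step back to the scalar remainder loop.
  if (RequiresScalarEpilogue && VecTripCount == N)
    VecTripCount -= Step;
  return VecTripCount;
}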
2466 PreVectorPH = CheckVPIRBB;
2474// Reuse existing vector loop preheader for TC checks. 2475// Note that new preheader block is generated for vector loop. 2479// Generate code to check if the loop's trip count is less than VF * UF, or 2480// equal to it in case a scalar epilogue is required; this implies that the 2481// vector trip count is zero. This check also covers the case where adding one 2482// to the backedge-taken count overflowed leading to an incorrect trip count 2483// of zero. In this case we will also jump to the scalar loop. 2487// If tail is to be folded, vector loop takes care of all iterations. 2490auto CreateStep = [&]() ->
Value * {
2491// Create step with max(MinProTripCount, UF * VF). 2505Value *Step = CreateStep();
2507// TODO: Emit unconditional branch to vector preheader instead of 2508// conditional branch with known condition. 2510// Check if the trip count is < the step. 2512// TODO: Ensure step is at most the trip count when determining max VF and 2513// UF, w/o tail folding. 2516 TripCountSCEV, SE.
getSCEV(Step))) {
2517// Generate the minimum iteration check only if we cannot prove the 2518// check is known to be true, or known to be false. 2520 }
// else step known to be < trip count, use CheckMinIters preset to false. 2524// vscale is not necessarily a power-of-2, which means we cannot guarantee 2525// an overflow to zero when updating induction variables and so an 2526// additional overflow check is required before entering the vector loop. 2528// Get the maximum unsigned value for the type. 2529Value *MaxUIntTripCount =
2530 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2533// Don't execute the vector loop if (UMax - n) < (VF * UF). 2537// Create new preheader for vector loop. 2544"TC check is expected to dominate Bypass");
2553// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2566"Cannot SCEV check stride or overflow when optimizing for size");
2568"Should already be a bypass block due to iteration count check");
2573return SCEVCheckBlock;
2577// VPlan-native path does not do any analysis for runtime checks currently. 2584// Check if we generated code that checks in runtime if arrays overlap. We put 2585// the checks into a separate block to make the more common case of few 2592"Cannot emit memory checks when optimizing for size, unless forced " 2598 <<
"Code-size may be reduced by not forcing " 2599"vectorization, or by source-code modifications " 2600"eliminating the need for runtime checks " 2601"(e.g., adding 'restrict').";
2610return MemCheckBlock;
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
    R.moveBefore(*IRVPBB, IRVPBB->end());

  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
         "loops not exiting via the latch without required epilogue?");
2637LI,
nullptr,
Twine(Prefix) +
"middle.block");
2641nullptr,
Twine(Prefix) +
"scalar.ph");
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
/// expansion results.
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  auto I = ExpandedSCEVs.find(Step);
  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
  return I->second;
}
/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
  auto *Cmp = L->getLatchCmpInst();
  if (Cmp)
    InstsToIgnore.insert(Cmp);
  for (const auto &KV : IL) {
    // Extract the key by hand so that it can be used in the lambda below. Note
    // that captured structured bindings are a C++20 extension.
    const PHINode *IV = KV.first;

    // Get next iteration value of the induction variable.
    Instruction *IVInst =
        cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
    if (all_of(IVInst->users(),
               [&](const User *U) { return U == IV || U == Cmp; }))
      InstsToIgnore.insert(IVInst);
  }
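// Hedged sketch of the reasoning above: when the vector loop is known to run
// exactly one iteration, the latch compare and an induction update that only
// feeds that compare fold away, so they should contribute no cost. The toy
// cost model and all names below are hypothetical.
#include <cstdint>

static uint64_t singleIterationLoopCost(uint64_t BodyCost, uint64_t IVUpdateCost,
                                        uint64_t LatchCmpCost,
                                        bool RunsExactlyOnce) {
  if (RunsExactlyOnce)
    return BodyCost; // IV update and latch test simplify away
  return BodyCost + IVUpdateCost + LatchCmpCost;
}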
2683const SCEV2ValueTy &ExpandedSCEVs,
Value *MainVectorTripCount) {
2684assert(MainVectorTripCount &&
"Must have bypass information");
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    Value *Step = getExpandedStep(II, ExpandedSCEVs);

    // For the primary induction the additional bypass end value is known.
    // Otherwise it is computed.
    Value *EndValueFromAdditionalBypass = MainVectorTripCount;
    if (OrigPhi != OldInduction) {
      auto *BinOp = II.getInductionBinOp();
      // Fast-math-flags propagate from the original induction instruction.
      if (isa_and_nonnull<FPMathOperator>(BinOp))
        B.setFastMathFlags(BinOp->getFastMathFlags());

      // Compute the end value for the additional bypass.
      EndValueFromAdditionalBypass =
          emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
                               II.getKind(), BinOp);
      EndValueFromAdditionalBypass->setName("ind.end");
    }

    // Store the bypass value here, as it needs to be added as operand to its
    // scalar preheader phi node after the epilogue skeleton has been created.
    // TODO: Directly add as extra operand to the VPResumePHI recipe.
           "entry for OrigPhi already exists");
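// Hedged sketch of the "ind.end" computation above for a simple integer
// induction: after the main vector loop has executed MainVectorTripCount
// iterations, the induction resumes at Start + MainVectorTripCount * Step.
// The function name is hypothetical; the real code emits this via
// emitTransformedIndex for every induction kind.
#include <cstdint>

static int64_t additionalBypassEndValue(int64_t Start, int64_t Step,
                                        int64_t MainVectorTripCount) {
  return Start + MainVectorTripCount * Step;
}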
2719const SCEV2ValueTy &ExpandedSCEVs) {
2721 In this function we generate a new loop. The new loop will contain 2722 the vectorized instructions while the old loop will continue to run the 2725 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2726 / | preheader are expanded here. Eventually all required SCEV 2727 / | expansion should happen here. 2729 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2732 || [ ] <-- vector pre header. 2736 | [ ]_| <-- vector loop (created during VPlan execution). 2739 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2740 | | successors created during VPlan execution) 2743 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 2745 (opt) v <-- edge from middle to exit iff epilogue is not required. 2747 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2748 | | wrapped in VPIRBasicBlock). 2751 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2755// Create an empty vector loop, and prepare basic blocks for the runtime 2759// Now, compare the new count to zero. If it is zero skip the vector loop and 2760// jump to the scalar loop. This check also covers the case where the 2761// backedge-taken count is uint##_max: adding one to it will overflow leading 2762// to an incorrect trip count of zero. In this (rare) case we will also jump 2763// to the scalar loop. 2766// Generate the code to check any assumptions that we've made for SCEV 2770// Generate the code that checks in runtime if arrays overlap. We put the 2771// checks into a separate block to make the more common case of few elements 2780structCSEDenseMapInfo {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(),
                        hash_combine_range(I->value_op_begin(),
                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
/// Perform CSE of induction variable instructions.
  // Perform simple cse.
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
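// A minimal sketch of the simple CSE performed above, over a toy instruction
// representation keyed by its textual form. The real pass keys a DenseMap with
// CSEDenseMapInfo and compares instructions with isIdenticalTo(); every name
// in this example is hypothetical.
#include <string>
#include <unordered_map>
#include <vector>

struct ToyInst {
  std::string Key;             // stands in for opcode + operands
  ToyInst *ReplacedBy = nullptr;
};

static void toyCSE(std::vector<ToyInst *> &Block) {
  std::unordered_map<std::string, ToyInst *> Seen;
  for (ToyInst *In : Block) {
    auto [It, Inserted] = Seen.try_emplace(In->Key, In);
    // A previously visited identical instruction exists: reuse it instead.
    if (!Inserted)
      In->ReplacedBy = It->second;
  }
}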
2833// We only need to calculate a cost if the VF is scalar; for actual vectors 2834// we should already have a pre-calculated cost at each VF. 2836return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2844for (
auto &ArgOp : CI->
args())
2845 Tys.push_back(ArgOp->getType());
2850// If this is an intrinsic we may have a lower cost for it. 2853return std::min(ScalarCallCost, IntrinsicCost);
2855return ScalarCallCost;
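// Hedged sketch of the shape of the decision above: the scalarized call cost
// is VF copies of the scalar call plus insert/extract overhead, and an
// intrinsic lowering is preferred when it is cheaper. The inputs and function
// name are hypothetical simplifications of the real cost model.
#include <algorithm>
#include <optional>

static unsigned vectorCallCostSketch(unsigned VF, unsigned ScalarCallCost,
                                     unsigned ScalarizationOverhead,
                                     std::optional<unsigned> IntrinsicCost) {
  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationOverhead;
  if (IntrinsicCost)
    return std::min(ScalarizedCost, *IntrinsicCost);
  return ScalarizedCost;
}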
2868assert(
ID &&
"Expected intrinsic call!");
2871if (
auto *FPMO = dyn_cast<FPMathOperator>(CI))
2872 FMF = FPMO->getFastMathFlags();
2878 std::back_inserter(ParamTys),
2879 [&](
Type *Ty) { return maybeVectorizeType(Ty, VF); });
2882 dyn_cast<IntrinsicInst>(CI));
2887// Fix widened non-induction PHIs by setting up the PHI operands. 2891// Forget the original basic block. 2895// After vectorization, the exit blocks of the original loop will have 2896// additional predecessors. Invalidate SCEVs for the exit phis in case SE 2897// looked through single-entry phis. 2901for (
PHINode &PN : Exit->phis())
  // Don't apply optimizations below when no vector region remains, as they all
  // require a vector loop at the moment.

  // Remove redundant induction instructions.

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.

  // The basic block and loop containing the predicated instruction.

  // Initialize a worklist with the operands of the predicated instruction.

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto IsBlockOfUsePredicated = [&](
      Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking
      // a load past non-store instructions.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // again here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!all_of(I->uses(), IsBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());
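// A compact sketch of the fixed-point sinking loop above, over a toy graph
// where each instruction knows its operands and users; all names are
// hypothetical. Sinking repeats until a full pass makes no change, and
// instructions whose users are not yet all predicated are retried later.
#include <algorithm>
#include <vector>

struct SinkInst {
  std::vector<SinkInst *> Operands;
  std::vector<SinkInst *> Users;
  bool InPredBlock = false;
  bool Sinkable = true; // e.g. not a phi, no side effects
};

static void sinkScalarOperandsSketch(SinkInst *Predicated) {
  std::vector<SinkInst *> Worklist(Predicated->Operands);
  bool Changed = true;
  while (Changed) {
    Changed = false;
    std::vector<SinkInst *> Next;
    for (SinkInst *I : Worklist) {
      if (!I->Sinkable || I->InPredBlock)
        continue;
      // Legal to sink only if every user already lives in the predicated
      // block; otherwise reanalyze the instruction on the next pass.
      bool AllUsesPredicated =
          std::all_of(I->Users.begin(), I->Users.end(),
                      [](const SinkInst *U) { return U->InPredBlock; });
      if (!AllUsesPredicated) {
        Next.push_back(I);
        continue;
      }
      I->InPredBlock = true; // "move" it into the predicated block
      Changed = true;
      // Its operands become new sinking candidates.
      for (SinkInst *Op : I->Operands)
        Next.push_back(Op);
    }
    Worklist = std::move(Next);
  }
}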
3006// The sinking may have enabled other instructions to be sunk, so we will 3015for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3020PHINode *NewPhi = cast<PHINode>(State.
get(VPPhi));
3021// Make sure the builder has a valid insert point. 3032void LoopVectorizationCostModel::collectLoopScalars(
ElementCount VF) {
3033// We should not collect Scalars more than once per VF. Right now, this 3034// function is called from collectUniformsAndScalars(), which already does 3035// this check. Collecting Scalars for VF=1 does not make any sense. 3037"This function should not be visited twice for the same VF");
3039// This avoids any chances of creating a REPLICATE recipe during planning 3040// since that would result in generation of scalarized code during execution, 3041// which is not supported for scalable vectors. 3043 Scalars[VF].
insert(Uniforms[VF].begin(), Uniforms[VF].end());
3049// These sets are used to seed the analysis with pointers used by memory 3050// accesses that will remain scalar. 3055// A helper that returns true if the use of Ptr by MemAccess will be scalar. 3056// The pointer operands of loads and stores will be scalar as long as the 3057// memory access is not a gather or scatter operation. The value operand of a 3058// store will remain scalar if the store is scalarized. 3062"Widening decision should be ready at this moment");
3063if (
auto *Store = dyn_cast<StoreInst>(MemAccess))
3064if (
Ptr == Store->getValueOperand())
3067"Ptr is neither a value or pointer operand");
3071// A helper that returns true if the given value is a getelementptr 3072// instruction contained in the loop. 3073auto IsLoopVaryingGEP = [&](
Value *
V) {
3077// A helper that evaluates a memory access's use of a pointer. If the use will 3078// be a scalar use and the pointer is only used by memory accesses, we place 3079// the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3080// PossibleNonScalarPtrs. 3082// We only care about bitcast and getelementptr instructions contained in 3084if (!IsLoopVaryingGEP(
Ptr))
3087// If the pointer has already been identified as scalar (e.g., if it was 3088// also identified as uniform), there's nothing to do. 3089auto *
I = cast<Instruction>(
Ptr);
3093// If the use of the pointer will be a scalar use, and all users of the 3094// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3095// place the pointer in PossibleNonScalarPtrs. 3096if (IsScalarUse(MemAccess,
Ptr) &&
3097all_of(
I->users(), IsaPred<LoadInst, StoreInst>))
3100 PossibleNonScalarPtrs.
insert(
I);
3103// We seed the scalars analysis with three classes of instructions: (1) 3104// instructions marked uniform-after-vectorization and (2) bitcast, 3105// getelementptr and (pointer) phi instructions used by memory accesses 3106// requiring a scalar use. 3108// (1) Add to the worklist all instructions that have been identified as 3109// uniform-after-vectorization. 3112// (2) Add to the worklist all bitcast and getelementptr instructions used by 3113// memory accesses requiring a scalar use. The pointer operands of loads and 3114// stores will be scalar unless the operation is a gather or scatter. 3115// The value operand of a store will remain scalar if the store is scalarized. 3117for (
auto &
I : *BB) {
3118if (
auto *Load = dyn_cast<LoadInst>(&
I)) {
3119 EvaluatePtrUse(Load,
Load->getPointerOperand());
3120 }
elseif (
auto *Store = dyn_cast<StoreInst>(&
I)) {
3121 EvaluatePtrUse(Store,
Store->getPointerOperand());
3122 EvaluatePtrUse(Store,
Store->getValueOperand());
3125for (
auto *
I : ScalarPtrs)
3126if (!PossibleNonScalarPtrs.
count(
I)) {
3131// Insert the forced scalars. 3132// FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3133// induction variable when the PHI user is scalarized. 3134auto ForcedScalar = ForcedScalars.
find(VF);
3135if (ForcedScalar != ForcedScalars.
end())
3136for (
auto *
I : ForcedScalar->second) {
3137LLVM_DEBUG(
dbgs() <<
"LV: Found (forced) scalar instruction: " << *
I <<
"\n");
3141// Expand the worklist by looking through any bitcasts and getelementptr 3142// instructions we've already identified as scalar. This is similar to the 3143// expansion step in collectLoopUniforms(); however, here we're only 3144// expanding to include additional bitcasts and getelementptr instructions. 3146while (
Idx != Worklist.
size()) {
3148if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3150auto *Src = cast<Instruction>(Dst->getOperand(0));
3152 auto *J = cast<Instruction>(U);
3153 return !TheLoop->contains(J) || Worklist.count(J) ||
3154 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3155 IsScalarUse(J, Src));
3158LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Src <<
"\n");
3162// An induction variable will remain scalar if all users of the induction 3163// variable and induction variable update remain scalar. 3165auto *Ind = Induction.first;
3166auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3168// If tail-folding is applied, the primary induction variable will be used 3169// to feed a vector compare. 3173// Returns true if \p Indvar is a pointer induction that is used directly by 3174// load/store instruction \p I. 3175auto IsDirectLoadStoreFromPtrIndvar = [&](
Instruction *Indvar,
3177return Induction.second.getKind() ==
3179 (isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
3183// Determine if all users of the induction variable are scalar after 3185bool ScalarInd =
all_of(Ind->users(), [&](
User *U) ->
bool {
3186 auto *I = cast<Instruction>(U);
3187 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3188 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3193// If the induction variable update is a fixed-order recurrence, neither the 3194// induction variable or its update should be marked scalar after 3196auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3200// Determine if all users of the induction variable update instruction are 3201// scalar after vectorization. 3202bool ScalarIndUpdate =
all_of(IndUpdate->users(), [&](
User *U) ->
bool {
3203 auto *I = cast<Instruction>(U);
3204 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3205 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3207if (!ScalarIndUpdate)
3210// The induction variable and its update instruction will remain scalar. 3212 Worklist.
insert(IndUpdate);
3213LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Ind <<
"\n");
3214LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *IndUpdate
3226// Do we have a non-scalar lowering for this predicated 3227// instruction? No - it is scalar with predication. 3228switch(
I->getOpcode()) {
3231case Instruction::Call:
3234return CallWideningDecisions.at(std::make_pair(cast<CallInst>(
I), VF))
3236case Instruction::Load:
3237case Instruction::Store: {
3249case Instruction::UDiv:
3250case Instruction::SDiv:
3251case Instruction::SRem:
3252case Instruction::URem: {
3253// We have the option to use the safe-divisor idiom to avoid predication. 3254// The cost based decision here will always select safe-divisor for 3255// scalable vectors as scalarization isn't legal. 3262// TODO: Fold into LoopVectorizationLegality::isMaskRequired. 3264// If predication is not needed, avoid it. 3265// TODO: We can use the loop-preheader as context point here and get 3266// context sensitive reasoning for isSafeToSpeculativelyExecute. 3270 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(
I))
3273// If the instruction was executed conditionally in the original scalar loop, 3274// predication is needed with a mask whose lanes are all possibly inactive. 3278// All that remain are instructions with side-effects originally executed in 3279// the loop unconditionally, but now execute under a tail-fold mask (only) 3280// having at least one active lane (the first). If the side-effects of the 3281// instruction are invariant, executing it w/o (the tail-folding) mask is safe 3282// - it will cause the same side-effects as when masked. 3283switch(
I->getOpcode()) {
3286"instruction should have been considered by earlier checks");
3287case Instruction::Call:
3288// Side-effects of a Call are assumed to be non-invariant, needing a 3291"should have returned earlier for calls not needing a mask");
3293case Instruction::Load:
3294// If the address is loop invariant no predication is needed. 3296case Instruction::Store: {
  // For stores, we need to prove both speculation safety (which follows from
  // the same argument as loads), but also must prove the value being stored
  // is correct. The easiest form of the latter is to require that all values
  // stored are the same.
  case Instruction::UDiv:
3305case Instruction::SDiv:
3306case Instruction::SRem:
3307case Instruction::URem:
3308// If the divisor is loop-invariant no predication is needed. 3313std::pair<InstructionCost, InstructionCost>
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
3322// Scalarization isn't legal for scalable vector types 3325// Get the scalarization cost and scale this amount by the probability of 3326// executing the predicated block. If the instruction is not predicated, 3327// we fall through to the next case. 3328 ScalarizationCost = 0;
3330// These instructions have a non-void type, so account for the phi nodes 3331// that we will create. This cost is likely to be zero. The phi node 3332// cost, if any, should be scaled by the block probability because it 3333// models a copy at the end of each predicated block. 3337// The cost of the non-predicated instruction. 3341// The cost of insertelement and extractelement instructions needed for 3343 ScalarizationCost += getScalarizationOverhead(
I, VF);
3345// Scale the cost by the probability of executing the predicated blocks. 3346// This assumes the predicated block for each vector lane is equally 3354// The cost of the select guard to ensure all lanes are well defined 3355// after we speculate above any internal control flow. 3361// Certain instructions can be cheaper to vectorize if they have a constant 3362// second vector operand. One example of this are shifts on x86. 3363Value *Op2 =
I->getOperand(1);
3372 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3374return {ScalarizationCost, SafeDivisorCost};
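// Hedged sketch of the trade-off computed above: the scalarization
// alternative is scaled by the probability that the predicated block actually
// executes, and the caller picks whichever of the two costs is smaller. The
// numbers and names below are hypothetical simplifications.
static bool preferSafeDivisor(double ScalarizationCost, double SafeDivisorCost,
                              double BlockExecProbability) {
  // Scalarized div/rem only runs when the predicated block is taken.
  double PredicatedScalarizationCost =
      ScalarizationCost * BlockExecProbability;
  return SafeDivisorCost < PredicatedScalarizationCost;
}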
3381"Decision should not be set yet.");
3383assert(Group &&
"Must have a group.");
3384unsigned InterleaveFactor = Group->getFactor();
3386// If the instruction's allocated size doesn't equal its type size, it 3387// requires padding and will be scalarized. 3388auto &
DL =
I->getDataLayout();
3393// We currently only know how to emit interleave/deinterleave with 3394// Factor=2 for scalable vectors. This is purely an implementation 3399// If the group involves a non-integral pointer, we may not be able to 3400// losslessly cast all values to a common type. 3401bool ScalarNI =
DL.isNonIntegralPointerType(ScalarTy);
3402for (
unsignedIdx = 0;
Idx < InterleaveFactor;
Idx++) {
3407bool MemberNI =
DL.isNonIntegralPointerType(
MemberTy);
3408// Don't coerce non-integral pointers to integers or vice versa. 3409if (MemberNI != ScalarNI)
3410// TODO: Consider adding special nullptr value case here 3412if (MemberNI && ScalarNI &&
3413 ScalarTy->getPointerAddressSpace() !=
3418// Check if masking is required. 3419// A Group may need masking for one of two reasons: it resides in a block that 3420// needs predication, or it was decided to use masking to deal with gaps 3421// (either a gap at the end of a load-access that may result in a speculative 3422// load, or any gaps in a store-access). 3423bool PredicatedAccessRequiresMasking =
3426bool LoadAccessWithGapsRequiresEpilogMasking =
3427 isa<LoadInst>(
I) && Group->requiresScalarEpilogue() &&
3429bool StoreAccessWithGapsRequiresMasking =
3430 isa<StoreInst>(
I) && (Group->getNumMembers() < Group->getFactor());
3431if (!PredicatedAccessRequiresMasking &&
3432 !LoadAccessWithGapsRequiresEpilogMasking &&
3433 !StoreAccessWithGapsRequiresMasking)
3436// If masked interleaving is required, we expect that the user/target had 3437// enabled it, because otherwise it either wouldn't have been created or 3438// it should have been invalidated by the CostModel. 3440"Masked interleave-groups for predicated accesses are not enabled.");
3442if (Group->isReverse())
3453// Get and ensure we have a valid memory instruction. 3454assert((isa<LoadInst, StoreInst>(
I)) &&
"Invalid memory instruction");
3459// In order to be widened, the pointer should be consecutive, first of all. 3463// If the instruction is a store located in a predicated block, it will be 3468// If the instruction's allocated size doesn't equal it's type size, it 3469// requires padding and will be scalarized. 3470auto &
DL =
I->getDataLayout();
3477void LoopVectorizationCostModel::collectLoopUniforms(
ElementCount VF) {
3478// We should not collect Uniforms more than once per VF. Right now, 3479// this function is called from collectUniformsAndScalars(), which 3480// already does this check. Collecting Uniforms for VF=1 does not make any 3484"This function should not be visited twice for the same VF");
3486// Visit the list of Uniforms. If we find no uniform value, we won't 3487// analyze again. Uniforms.count(VF) will return 1. 3488 Uniforms[VF].
clear();
3490// Now we know that the loop is vectorizable! 3491// Collect instructions inside the loop that will remain uniform after 3494// Global values, params and instructions outside of current loop are out of 3496auto IsOutOfScope = [&](
Value *V) ->
bool {
3501// Worklist containing uniform instructions demanding lane 0. 3504// Add uniform instructions demanding lane 0 to the worklist. Instructions 3505// that require predication must not be considered uniform after 3506// vectorization, because that would create an erroneous replicating region 3507// where only a single instance out of VF should be formed. 3508auto AddToWorklistIfAllowed = [&](
Instruction *
I) ->
void {
3509if (IsOutOfScope(
I)) {
3516dbgs() <<
"LV: Found not uniform due to requiring predication: " << *
I 3524// Start with the conditional branches exiting the loop. If the branch 3525// condition is an instruction contained in the loop that is only used by the 3526// branch, it is uniform. Note conditions from uncountable early exits are not 3533auto *
Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3535 AddToWorklistIfAllowed(Cmp);
3539// Return true if all lanes perform the same memory operation, and we can 3540// thus choose to execute only one. 3542// If the value was already known to not be uniform for the previous 3543// (smaller VF), it cannot be uniform for the larger VF. 3544if (PrevVF.isVector()) {
3545auto Iter = Uniforms.
find(PrevVF);
3546if (Iter != Uniforms.
end() && !Iter->second.contains(
I))
3551if (isa<LoadInst>(
I))
3552// Loading the same address always produces the same result - at least 3553// assuming aliasing and ordering which have already been checked. 3555// Storing the same value on every iteration. 3562"Widening decision should be ready at this moment");
3564if (IsUniformMemOpUse(
I))
3567return (WideningDecision ==
CM_Widen ||
3572// Returns true if Ptr is the pointer operand of a memory access instruction 3573// I, I is known to not require scalarization, and the pointer is not also 3576if (isa<StoreInst>(
I) &&
I->getOperand(0) ==
Ptr)
3582// Holds a list of values which are known to have at least one uniform use. 3583// Note that there may be other uses which aren't uniform. A "uniform use" 3584// here is something which only demands lane 0 of the unrolled iterations; 3585// it does not imply that all lanes produce the same value (e.g. this is not 3586// the usual meaning of uniform) 3589// Scan the loop for instructions which are either a) known to have only 3590// lane 0 demanded or b) are uses which demand only lane 0 of their operand. 3592for (
auto &
I : *BB) {
3594switch (
II->getIntrinsicID()) {
3595case Intrinsic::sideeffect:
3596case Intrinsic::experimental_noalias_scope_decl:
3597case Intrinsic::assume:
3598case Intrinsic::lifetime_start:
3599case Intrinsic::lifetime_end:
3601 AddToWorklistIfAllowed(&
I);
3608// ExtractValue instructions must be uniform, because the operands are 3609// known to be loop-invariant. 3610if (
auto *EVI = dyn_cast<ExtractValueInst>(&
I)) {
3611assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3612"Expected aggregate value to be loop invariant");
3613 AddToWorklistIfAllowed(EVI);
3617// If there's no pointer operand, there's nothing to do. 3622if (IsUniformMemOpUse(&
I))
3623 AddToWorklistIfAllowed(&
I);
3625if (IsVectorizedMemAccessUse(&
I,
Ptr))
3629// Add to the worklist any operands which have *only* uniform (e.g. lane 0 3630// demanding) users. Since loops are assumed to be in LCSSA form, this 3631// disallows uses outside the loop as well. 3632for (
auto *V : HasUniformUse) {
3635auto *
I = cast<Instruction>(V);
3636bool UsersAreMemAccesses =
all_of(
I->users(), [&](
User *U) ->
bool {
3637 auto *UI = cast<Instruction>(U);
3638 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3640if (UsersAreMemAccesses)
3641 AddToWorklistIfAllowed(
I);
3644// Expand Worklist in topological order: whenever a new instruction 3645// is added , its users should be already inside Worklist. It ensures 3646// a uniform instruction will only be used by uniform instructions. 3648while (
Idx != Worklist.
size()) {
3651for (
auto *OV :
I->operand_values()) {
3652// isOutOfScope operands cannot be uniform instructions. 3653if (IsOutOfScope(OV))
3655// First order recurrence Phi's should typically be considered 3657auto *
OP = dyn_cast<PHINode>(OV);
3660// If all the users of the operand are uniform, then add the 3661// operand into the uniform worklist. 3662auto *OI = cast<Instruction>(OV);
3664 auto *J = cast<Instruction>(U);
3665 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3667 AddToWorklistIfAllowed(OI);
3671// For an instruction to be added into Worklist above, all its users inside 3672// the loop should also be in Worklist. However, this condition cannot be 3673// true for phi nodes that form a cyclic dependence. We must process phi 3674// nodes separately. An induction variable will remain uniform if all users 3675// of the induction variable and induction variable update remain uniform. 3676// The code below handles both pointer and non-pointer induction variables. 3679auto *Ind = Induction.first;
3680auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3682// Determine if all users of the induction variable are uniform after 3684bool UniformInd =
all_of(Ind->users(), [&](
User *U) ->
bool {
3685 auto *I = cast<Instruction>(U);
3686 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3687 IsVectorizedMemAccessUse(I, Ind);
3692// Determine if all users of the induction variable update instruction are 3693// uniform after vectorization. 3694bool UniformIndUpdate =
all_of(IndUpdate->users(), [&](
User *U) ->
bool {
3695 auto *I = cast<Instruction>(U);
3696 return I == Ind || Worklist.count(I) ||
3697 IsVectorizedMemAccessUse(I, IndUpdate);
3699if (!UniformIndUpdate)
3702// The induction variable and its update instruction will remain uniform. 3703 AddToWorklistIfAllowed(Ind);
3704 AddToWorklistIfAllowed(IndUpdate);
3715"runtime pointer checks needed. Enable vectorization of this " 3716"loop with '#pragma clang loop vectorize(enable)' when " 3717"compiling with -Os/-Oz",
3724"runtime SCEV checks needed. Enable vectorization of this " 3725"loop with '#pragma clang loop vectorize(enable)' when " 3726"compiling with -Os/-Oz",
3731// FIXME: Avoid specializing for stride==1 instead of bailing out. 3734"runtime stride == 1 checks needed. Enable vectorization of " 3735"this loop without such check by compiling with -Os/-Oz",
3743bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3744if (IsScalableVectorizationAllowed)
3745return *IsScalableVectorizationAllowed;
3747 IsScalableVectorizationAllowed =
false;
3760 std::numeric_limits<ElementCount::ScalarTy>::max());
3762// Test that the loop-vectorizer can legalize all operations for this MaxVF. 3763// FIXME: While for scalable vectors this is currently sufficient, this should 3764// be replaced by a more detailed mechanism that filters out specific VFs, 3765// instead of invalidating vectorization for a whole set of VFs based on the 3768// Disable scalable vectorization if the loop contains unsupported reductions. 3771"Scalable vectorization not supported for the reduction " 3772"operations found in this loop.",
3777// Disable scalable vectorization if the loop contains any instructions 3778// with element types not supported for scalable vectors. 3784"for all element types found in this loop.",
3791"for safe distance analysis.",
3796 IsScalableVectorizationAllowed =
true;
3801LoopVectorizationCostModel::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
3802if (!isScalableVectorizationAllowed())
3806 std::numeric_limits<ElementCount::ScalarTy>::max());
3808return MaxScalableVF;
3811// Limit MaxScalableVF by the maximum safe dependence distance. 3816"Max legal vector width too small, scalable vectorization " 3820return MaxScalableVF;
3824unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking) {
3826unsigned SmallestType, WidestType;
3829// Get the maximum safe dependence distance in bits computed by LAA. 3830// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3831// the memory accesses that is most restrictive (involved in the smallest 3832// dependence distance). 3833unsigned MaxSafeElements =
3837auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3839 this->MaxSafeElements = MaxSafeElements;
3841LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
3843LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
3846// First analyze the UserVF, fall back if the UserVF should be ignored. 3849 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3852// If `VF=vscale x N` is safe, then so is `VF=N` 3862// Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3863// is better to ignore the hint and let the compiler choose a suitable VF. 3866 <<
" is unsafe, clamping to max safe VF=" 3867 << MaxSafeFixedVF <<
".\n");
3872 <<
"User-specified vectorization factor " 3873 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3874 <<
" is unsafe, clamping to maximum safe vectorization factor " 3875 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
3877return MaxSafeFixedVF;
3882 <<
" is ignored because scalable vectors are not " 3888 <<
"User-specified vectorization factor " 3889 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3890 <<
" is ignored because the target does not support scalable " 3891"vectors. The compiler will pick a more suitable value.";
3895 <<
" is unsafe. Ignoring scalable UserVF.\n");
3900 <<
"User-specified vectorization factor " 3901 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3902 <<
" is unsafe. Ignoring the hint to let the compiler pick a " 3903"more suitable value.";
3908LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
3909 <<
" / " << WidestType <<
" bits.\n");
3914 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3915 MaxSafeFixedVF, FoldTailByMasking))
3919 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3920 MaxSafeScalableVF, FoldTailByMasking))
3921if (MaxVF.isScalable()) {
3922Result.ScalableVF = MaxVF;
3933// TODO: It may be useful to do since it's still likely to be dynamically 3934// uniform if the target can skip. 3936"Not inserting runtime ptr check for divergent target",
3937"runtime pointer checks needed. Not enabled for divergent target",
3938"CantVersionLoopWithDivergentTarget",
ORE,
TheLoop);
3947LLVM_DEBUG(
dbgs() <<
"LV: Found maximum trip count: " << MaxTC <<
'\n');
3950"loop trip count is one, irrelevant for vectorization",
3955// If BTC matches the widest induction type and is -1 then the trip count 3956// computation will wrap to 0 and the vector trip count will be 0. Do not try 3959if (!isa<SCEVCouldNotCompute>(BTC) &&
3965"Trip count computation wrapped",
3966"backedge-taken count is -1, loop trip count wrapped to 0",
3971switch (ScalarEpilogueStatus) {
3973return computeFeasibleMaxVF(MaxTC, UserVF,
false);
3978dbgs() <<
"LV: vector predicate hint/switch found.\n" 3979 <<
"LV: Not allowing scalar epilogue, creating predicated " 3980 <<
"vector loop.\n");
3983// fallthrough as a special case of OptForSize 3987dbgs() <<
"LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3989LLVM_DEBUG(
dbgs() <<
"LV: Not allowing scalar epilogue due to low trip " 3992// Bail if runtime checks are required, which are not good when optimising 4000// The only loops we can vectorize without a scalar epilogue, are loops with 4001// a bottom-test and a single exiting block. We'd have to handle the fact 4002// that not every instruction executes on the last iteration. This will 4003// require a lane mask which varies through the vector loop body. (TODO) 4005// If there was a tail-folding hint/switch, but we can't fold the tail by 4006// masking, fallback to a vectorization with a scalar epilogue. 4008LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a " 4009"scalar epilogue instead.\n");
4011return computeFeasibleMaxVF(MaxTC, UserVF,
false);
4016// Now try the tail folding 4018// Invalidate interleave groups that require an epilogue if we can't mask 4019// the interleave-group. 4022"No decisions should have been taken at this point");
4023// Note: There is no need to invalidate any cost modeling decisions here, as 4024// none were taken so far. 4030// Avoid tail folding if the trip count is known to be a multiple of any VF 4032 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4037 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4038 *MaxPowerOf2RuntimeVF,
4041 MaxPowerOf2RuntimeVF = std::nullopt;
// Stick with tail-folding for now. 4044if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4046"MaxFixedVF must be a power of 2");
4047unsigned MaxVFtimesIC =
4048 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4050// Currently only loops with countable exits are vectorized, but calling 4051// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4052// uncountable exits whilst also ensuring the symbolic maximum and known 4053// back-edge taken count remain identical for loops with countable exits. 4056"Invalid loop count");
4058 BackedgeTakenCount, SE->
getOne(BackedgeTakenCount->
getType()));
4063// Accept MaxFixedVF if we do not have a tail. 4064LLVM_DEBUG(
dbgs() <<
"LV: No tail will remain for any chosen VF.\n");
4069// If we don't know the precise trip count, or if the trip count that we 4070// found modulo the vectorization factor is not zero, try to fold the tail 4072// FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4078 <<
"LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4079"try to generate VP Intrinsics with scalable vector " 4081// Tail folded loop using VP intrinsics restricts the VF to be scalable 4083// TODO: extend it for fixed vectors, if required. 4085"Expected scalable vector factor.");
4092// If there was a tail-folding hint/switch, but we can't fold the tail by 4093// masking, fallback to a vectorization with a scalar epilogue. 4095LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a " 4096"scalar epilogue instead.\n");
4102LLVM_DEBUG(
dbgs() <<
"LV: Can't fold tail by masking: don't vectorize\n");
4108"unable to calculate the loop count due to complex control flow",
4114"Cannot optimize for size and vectorize at the same time.",
4115"cannot optimize for size and vectorize at the same time. " 4116"Enable vectorization of this loop with '#pragma clang loop " 4117"vectorize(enable)' when compiling with -Os/-Oz",
4122ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4123unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
4125bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
4130// Convenience function to return the minimum of two ElementCounts. 4133"Scalable flags must match");
4137// Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4138// Note that both WidestRegister and WidestType may not be a powers of 2. 4141 ComputeScalableMaxVF);
4142 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4144 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
4146if (!MaxVectorElementCount) {
4148 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
4149 <<
" vector registers.\n");
4153unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4154if (MaxVectorElementCount.isScalable() &&
4158 WidestRegisterMinEC *= Min;
4161// When a scalar epilogue is required, at least one iteration of the scalar 4162// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4163// max VF that results in a dead vector loop. 4167if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4169// If upper bound loop trip count (TC) is known at compile time there is no 4170// point in choosing VF greater than TC (as done in the loop below). Select 4171// maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4172// scalable, we only fall back on a fixed VF when the TC is less than or 4173// equal to the known number of lanes. 4175LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not " 4176"exceeding the constant trip count: " 4177 << ClampedUpperTripCount <<
"\n");
4179 ClampedUpperTripCount,
4180 FoldTailByMasking ? MaxVectorElementCount.isScalable() :
false);
4193 ComputeScalableMaxVF);
4194 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4196// Collect all viable vectorization factors larger than the default MaxVF 4197// (i.e. MaxVectorElementCount). 4203// For each VF calculate its register usage. 4206// Select the largest VF which doesn't require more registers than existing 4208for (
intI = RUs.size() - 1;
I >= 0; --
I) {
4209constauto &MLU = RUs[
I].MaxLocalUsers;
4210if (
all_of(MLU, [&](
decltype(MLU.front()) &LU) {
4211 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4221 <<
") with target's minimum: " << MinVF <<
'\n');
4226// Invalidate any widening decisions we might have made, in case the loop 4227// requires prediction (decided later), but we have already made some 4228// load/store widening decisions. 4234/// Convenience function that returns the value of vscale_range iff 4235/// vscale_range.min == vscale_range.max or otherwise returns the value 4236/// returned by the corresponding TTI method. 4237static std::optional<unsigned>
4239constFunction *Fn = L->getHeader()->getParent();
4243auto Max = Attr.getVScaleRangeMax();
4244if (Max && Min == Max)
/// This function attempts to return a value that represents the vectorization
/// factor at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// value of vscale.
    EstimatedVF *= *VScale;
  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4266bool LoopVectorizationPlanner::isMoreProfitable(
4268constunsigned MaxTripCount)
const{
4272// Improve estimate for the vector width if it is scalable. 4273unsigned EstimatedWidthA =
A.Width.getKnownMinValue();
4274unsigned EstimatedWidthB =
B.Width.getKnownMinValue();
4276if (
A.Width.isScalable())
4277 EstimatedWidthA *= *VScale;
4278if (
B.Width.isScalable())
4279 EstimatedWidthB *= *VScale;
4282// Assume vscale may be larger than 1 (or the value being tuned for), 4283// so that scalable vectorization is slightly favorable over fixed-width 4286A.Width.isScalable() && !
B.Width.isScalable();
  // To avoid the need for FP division:
  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
  if (!MaxTripCount)
    return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);

  auto GetCostForTC = [MaxTripCount, this](unsigned VF,
                                           InstructionCost VectorCost,
                                           InstructionCost ScalarCost) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    if (CM.foldTailByMasking())
      return VectorCost * divideCeil(MaxTripCount, VF);
    return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
  };

  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
  return CmpFn(RTCostA, RTCostB);
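// Hedged sketch of the comparison above for a known trip count, assuming the
// comparison function is a plain strictly-less-than; all names are
// hypothetical stand-ins for the VectorizationFactor candidates.
#include <cstdint>

static uint64_t loopBodyCostForTC(uint64_t TC, uint64_t VF, uint64_t VectorCost,
                                  uint64_t ScalarCost, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return VectorCost * ((TC + VF - 1) / VF); // VecCost * ceil(TC / VF)
  return VectorCost * (TC / VF) + ScalarCost * (TC % VF);
}

static bool isMoreProfitableSketch(uint64_t CostA, uint64_t WidthA,
                                   uint64_t CostB, uint64_t WidthB) {
  // Per-lane comparison without FP division:
  //   CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
  return CostA * WidthB < CostB * WidthA;
}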
4320bool LoopVectorizationPlanner::isMoreProfitable(
4323return LoopVectorizationPlanner::isMoreProfitable(
A,
B, MaxTripCount);
4328usingRecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4330for (
constauto &Plan : VPlans) {
4334 precomputeCosts(*Plan, VF, CostCtx);
4336for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4337for (
auto &R : *VPBB) {
4338if (!R.cost(VF, CostCtx).isValid())
4344if (InvalidCosts.
empty())
4347// Emit a report of VFs with invalid costs in the loop. 4349// Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4352for (
auto &Pair : InvalidCosts)
4353if (!Numbering.
count(Pair.first))
4354 Numbering[Pair.first] =
I++;
4356// Sort the list, first on recipe(number) then on VF. 4357sort(InvalidCosts, [&Numbering](RecipeVFPair &
A, RecipeVFPair &
B) {
4358if (Numbering[
A.first] != Numbering[
B.first])
4359return Numbering[
A.first] < Numbering[
B.first];
4360constauto &
LHS =
A.second;
4361constauto &
RHS =
B.second;
4362return std::make_tuple(
LHS.isScalable(),
LHS.getKnownMinValue()) <
4363 std::make_tuple(
RHS.isScalable(),
RHS.getKnownMinValue());
4366// For a list of ordered recipe-VF pairs: 4367// [(load, VF1), (load, VF2), (store, VF1)] 4368// group the recipes together to emit separate remarks for: 4375 Subset =
Tail.take_front(1);
4382 [](
constauto *R) {
return Instruction::PHI; })
4383 .Case<VPWidenSelectRecipe>(
4384 [](
constauto *R) {
return Instruction::Select; })
4385 .Case<VPWidenStoreRecipe>(
4386 [](
constauto *R) {
return Instruction::Store; })
4387 .Case<VPWidenLoadRecipe>(
4388 [](
constauto *R) {
return Instruction::Load; })
4389 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4390 [](
constauto *R) {
return Instruction::Call; })
4393 [](
constauto *R) {
return R->getOpcode(); })
4395return R->getStoredValues().empty() ? Instruction::Load
4396 : Instruction::Store;
4399// If the next recipe is different, or if there are no other pairs, 4400// emit a remark for the collated subset. e.g. 4401// [(load, VF1), (load, VF2))] 4403// remark: invalid costs for 'load' at VF=(VF1, VF2) 4404if (Subset ==
Tail ||
Tail[Subset.size()].first != R) {
4405 std::string OutString;
4407assert(!Subset.empty() &&
"Unexpected empty range");
4408OS <<
"Recipe with invalid costs prevented vectorization at VF=(";
4409for (
constauto &Pair : Subset)
4410OS << (Pair.second == Subset.front().second ?
"" :
", ") << Pair.second;
4412if (Opcode == Instruction::Call) {
4414if (
auto *
Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4417auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4419 WidenCall ? WidenCall->getCalledScalarFunction()
4420 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4421 ->getLiveInIRValue());
4429Tail =
Tail.drop_front(Subset.size());
4432// Grow the subset by one element 4433 Subset =
Tail.take_front(Subset.size() + 1);
4434 }
while (!
Tail.empty());
4437/// Check if any recipe of \p Plan will generate a vector value, which will be 4438/// assigned a vector register. 4445// Set of already visited types. 4447for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4452// Continue early if the recipe is considered to not produce a vector 4453// result. Note that this includes VPInstruction where some opcodes may 4454// produce a vector, to preserve existing behavior as VPInstructions model 4455// aspects not directly mapped to existing IR instructions. 4456switch (R.getVPDefID()) {
4457case VPDef::VPDerivedIVSC:
4458case VPDef::VPScalarIVStepsSC:
4459case VPDef::VPScalarCastSC:
4460case VPDef::VPReplicateSC:
4461case VPDef::VPInstructionSC:
4462case VPDef::VPCanonicalIVPHISC:
4463case VPDef::VPVectorPointerSC:
4464case VPDef::VPReverseVectorPointerSC:
4465case VPDef::VPExpandSCEVSC:
4466case VPDef::VPEVLBasedIVPHISC:
4467case VPDef::VPPredInstPHISC:
4468case VPDef::VPBranchOnMaskSC:
4470case VPDef::VPReductionSC:
4471case VPDef::VPActiveLaneMaskPHISC:
4472case VPDef::VPWidenCallSC:
4473case VPDef::VPWidenCanonicalIVSC:
4474case VPDef::VPWidenCastSC:
4475case VPDef::VPWidenGEPSC:
4476case VPDef::VPWidenIntrinsicSC:
4477case VPDef::VPWidenSC:
4478case VPDef::VPWidenSelectSC:
4479case VPDef::VPBlendSC:
4480case VPDef::VPFirstOrderRecurrencePHISC:
4481case VPDef::VPWidenPHISC:
4482case VPDef::VPWidenIntOrFpInductionSC:
4483case VPDef::VPWidenPointerInductionSC:
4484case VPDef::VPReductionPHISC:
4485case VPDef::VPInterleaveSC:
4486case VPDef::VPWidenLoadEVLSC:
4487case VPDef::VPWidenLoadSC:
4488case VPDef::VPWidenStoreEVLSC:
4489case VPDef::VPWidenStoreSC:
4495auto WillWiden = [&
TTI, VF](
Type *ScalarTy) {
4501// <vscale x 1 x iN> is assumed to be profitable over iN because 4502// scalable registers are a distinct register class from scalar 4503// ones. If we ever find a target which wants to lower scalable 4504// vectors back to scalars, we'll need to update this code to 4505// explicitly ask TTI about the register class uses for each part. 4508// Two or more parts that share a register - are vectorized. 4512// If no def nor is a store, e.g., branches, continue - no value to check. 4513if (R.getNumDefinedValues() == 0 &&
4514 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4517// For multi-def recipes, currently only interleaved loads, suffice to 4518// check first def only. 4519// For stores check their stored value; for interleaved stores suffice 4520// the check first stored value only. In all cases this is the second 4523 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4525if (!Visited.
insert({ScalarTy}).second)
4527if (WillWiden(ScalarTy))
4538LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << ExpectedCost <<
".\n");
4539assert(ExpectedCost.
isValid() &&
"Unexpected invalid cost for scalar loop");
4541 [](std::unique_ptr<VPlan> &
P) {
4544"Expected Scalar VF to be a candidate");
4551if (ForceVectorization &&
4552 (VPlans.
size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4553// Ignore scalar width, because the user explicitly wants vectorization. 4554// Initialize cost to max so that VF = 2 is, at least, chosen during cost 4559for (
auto &
P : VPlans) {
4561// The cost for scalar VF=1 is already calculated, so ignore it. 4570 <<
" costs: " << (Candidate.Cost / Width));
4580 <<
"LV: Not considering vector loop of width " << VF
4581 <<
" because it will not generate any vector instructions.\n");
4585if (isMoreProfitable(Candidate, ChosenFactor))
4586 ChosenFactor = Candidate;
4592"There are conditional stores.",
4593"store that is conditionally executed prevents vectorization",
4594"ConditionalStore", ORE, OrigLoop);
4595 ChosenFactor = ScalarCost;
4599 !isMoreProfitable(ChosenFactor, ScalarCost))
dbgs()
4600 <<
"LV: Vectorization seems to be not beneficial, " 4601 <<
"but was forced by a user.\n");
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
        [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
    // Look for uses of the value of the induction at the last iteration.
      if (!OrigLoop->contains(cast<Instruction>(U)))
    // Look for uses of penultimate value of the induction.
      if (!OrigLoop->contains(cast<Instruction>(U)))

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  // TODO: Add support for loops with an early exit.

  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.

  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).

  // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
  // VFs when deciding profitability.
  // See related "TODO: extend to support scalable VFs." in
  // selectEpilogueVectorizationFactor.
  unsigned Multiplier = VF.isFixed() ? IC : 1;

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");

    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");

      LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
      dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.

  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
        (NextVF.Width.isScalable() &&
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        assert(!isa<SCEVCouldNotCompute>(TC) &&
               "Trip count SCEV must be computable");
                        << MaxTripCount << "\n");
              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
              RemainingIterations))

    if (Result.Width.isScalar() ||
        isMoreProfitable(NextVF, Result, MaxTripCount))

                    << Result.Width << "\n");
std::pair<unsigned, unsigned>
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;

  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
    // Reset MaxWidth so that we can find the smallest type used by recurrences
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MaxWidth = std::min<unsigned>(
          MaxWidth, std::min<unsigned>(
      MinWidth = std::min<unsigned>(
          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());

  return {MinWidth, MaxWidth};
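// Illustrative sketch (not part of the pass): the scan above is a running
// min/max over the scalar bit widths of the loads, stores and recurrences
// seen in the loop. A hypothetical standalone form over precomputed widths:
auto SketchSmallestAndWidestBits = [](llvm::ArrayRef<unsigned> Widths) {
  unsigned MinWidth = -1U; // "nothing seen yet"
  unsigned MaxWidth = 8;   // never report narrower than a byte
  for (unsigned W : Widths) {
    MinWidth = std::min(MinWidth, W);
    MaxWidth = std::max(MaxWidth, W);
  }
  return std::make_pair(MinWidth, MaxWidth);
};
(void)SketchSmallestAndWidestBits;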
  // For each instruction in the loop.
      // Skip ignored values.

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

             "Expected the load/store/recurrence type to be sized");
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  // Do not interleave if EVL is preferred and no User IC is specified.
    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                         "Unroll factor forced to be 1.\n");

  // We used the distance for the interleave count.

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // them.

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

  // Loop body is free and there is no need for interleaving.

  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(Pair.second, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
                      << " register class\n");
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC =
        llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                        // Don't count the induction variable as interleaved.
                        std::max(1U, (MaxLocalUsers - 1)));

    IC = std::min(IC, TmpIC);
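    // Illustrative sketch (not part of the pass): the per-register-class
    // formula applied just above, as a standalone helper. Loop-invariant
    // values occupy registers in every interleaved copy, so they are
    // subtracted first; the induction variable is not counted; the result is
    // rounded down to a power of two. Names are hypothetical.
    auto SketchInterleaveCountForClass = [](unsigned TargetNumRegisters,
                                            unsigned LoopInvariantRegs,
                                            unsigned MaxLocalUsers) -> unsigned {
      unsigned Usable = TargetNumRegisters > LoopInvariantRegs
                            ? TargetNumRegisters - LoopInvariantRegs
                            : 0;
      unsigned PerCopy = MaxLocalUsers > 1 ? MaxLocalUsers - 1 : 1;
      unsigned Count = llvm::bit_floor(Usable / PerCopy);
      return Count ? Count : 1;
    };
    (void)SketchInterleaveCountForClass;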
  // Clamp the interleave ranges to reasonable counts.

  // Check if the user has overridden the max.

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    unsigned AvailableTC =

    // If trip count is known we select between two prospective ICs, where
    // 1) the aggressive IC is capped by the trip count divided by VF
    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
    // The final IC is selected in a way that the epilogue loop trip count is
    // minimized while maximizing the IC itself, so that we either run the
    // vector loop at least once if it generates a small epilogue loop, or else
    // we run the vector loop at least twice.
        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
    unsigned InterleaveCountLB = bit_floor(std::max(
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
    MaxInterleaveCount = InterleaveCountLB;

    if (InterleaveCountUB != InterleaveCountLB) {
      unsigned TailTripCountUB =
          (AvailableTC % (EstimatedVF * InterleaveCountUB));
      unsigned TailTripCountLB =
          (AvailableTC % (EstimatedVF * InterleaveCountLB));
      // If both produce the same scalar tail, maximize the IC to do the same
      // work in fewer vector loop iterations.
      if (TailTripCountUB == TailTripCountLB)
        MaxInterleaveCount = InterleaveCountUB;
    }
  } else if (BestKnownTC && *BestKnownTC > 0) {
    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
                              ? (*BestKnownTC) - 1

    // If trip count is an estimated compile time constant, limit the
    // IC to be capped by the trip count divided by VF * 2, such that the vector
    // loop runs at least twice to make interleaving seem profitable when there
    // is an epilogue loop present. Since the exact trip count is not known we
    // choose to be conservative in our IC estimate.
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
        return Legal->blockNeedsPredication(BB);
  bool ScalarInterleavingRequiresRuntimePointerCheck =

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          RecurKind RK = RdxDesc.getRecurrenceKind();
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
                 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    bool HasOrderedReductions =
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RdxDesc.isOrdered();
    if (HasOrderedReductions) {
        dbgs() << "LV: Not interleaving scalar ordered reductions.\n");

      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);

        std::max(StoresIC, LoadsIC) > SmallIC) {
        dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleaveReductions) {

      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in topological order and assign a number
// to each instruction. We use RPO to ensure that defs are met before their
// users. We assume that each instruction that has in-loop users starts an
// interval. We record every time that an in-loop value is used, so we have a
// list of the first and last occurrences of each instruction. Next, we
// transpose this data structure into a multi map that holds the list of
// intervals that *end* at a specific location. This multi map allows us to
// perform a linear search. We scan the instructions linearly and record each
// time that a new interval starts, by placing it in a set. If we find this
// value in the multi-map then we remove it from the set. The max register
// usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// registers.

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.

  // Maps instruction to its index.
  // Marks the end of each interval.
  // Saves the list of instruction indices that are used in the loop.
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants).

        // If this instruction is outside the loop then record it and continue.
          LoopInvariants.insert(Instr);

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();

  // Saves the list of intervals that end with the index in 'key'.

  // Transpose the EndPoints to a list of values that end at each index.

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  const auto &TTICapture = TTI;
        !TTICapture.isElementTypeLegalForScalableVector(Ty)))

  for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[Idx];

    // Ignore instructions that are never used within the loop.

    // Skip ignored values.

    // For each VF find the maximum usage of registers.
    for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      if (VFs[J].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.

          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);

        auto &Entry = MaxUsages[J][Pair.first];
        Entry = std::max(Entry, Pair.second);

                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.

  // Note that elements in this SmallMapVector will be default constructed
  // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
  // there is no previous entry for ClassID.
  for (auto *Inst : LoopInvariants) {
    // FIXME: The target might use more than one register for the type
    // even in the scalar case.
    bool IsScalar = all_of(Inst->users(), [&](User *U) {
      auto *I = cast<Instruction>(U);
      return TheLoop != LI->getLoopFor(I->getParent()) ||
             isScalarAfterVectorization(I, VFs[Idx]);

    Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);

    dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
    dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
    for (const auto &pair : MaxUsages[Idx]) {
      dbgs() << "LV(REG): RegisterClass: "
    dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
    for (const auto &pair : Invariant) {
      dbgs() << "LV(REG): RegisterClass: "

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  PredicatedBBsAfterVectorization[VF].clear();

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.

        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        //    of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
            !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {

          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, _] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(I);
            if (!CI || !CallWideningDecisions.contains({CI, VF}))
            CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];

      // Remember that BB will remain after vectorization.
      PredicatedBBsAfterVectorization[VF].insert(BB);
        if (Pred->getSingleSuccessor() == BB)
          PredicatedBBsAfterVectorization[VF].insert(Pred);

         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.

  // Returns true if the given instruction can be scalarized.
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // profitable.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))

    // Otherwise, we can scalarize the instruction.

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  while (!Worklist.empty()) {

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(I))

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.

    // Compute the scalarization overhead of needed insertelement instructions.

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // extraction.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
        else if (needsExtract(J, VF)) {
              cast<VectorType>(toVectorTy(J->getType(), VF)),

    // Scale the total scalar cost by block probability.

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
                                    ValuesToIgnoreForVF);

  // For each instruction in the old loop.
      // Skip ignored values.

      // Check if we should override the cost.
                        << VF << " For instruction: " << I << '\n');

      // If we are vectorizing a predicated block, it will have been
      // if-converted. This means that the block's instructions (aside from
      // stores and instructions that may divide by zero) will now be
      // unconditionally executed. For the scalar case, we may not always execute
      // the predicated block, if it is an if-else block. Thus, scale the block's
      // cost by the probability of executing it. blockNeedsPredication from
      // Legal is used so as to not include all blocks in tail folded loops.

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
                                          const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto *SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
        !Legal->isInductionVariable(Opd))

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.

LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
         "Scalarization cost of instruction implies vectorization.");

  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time.

  // Get the cost of the scalar memory instruction and address computation.

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.

    // Add the cost of an i1 extract and a branch
        /*Insert=*/false, /*Extract=*/true, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.

LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  bool Reverse = ConsecutiveStride < 0;
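  // Illustrative sketch (not part of the pass): for a consecutive access the
  // widened cost is one wide load/store plus a reverse shuffle when the
  // stride is -1. Plain integer costs stand in for the TTI queries used here.
  auto SketchConsecutiveMemOpCost = [](long WideMemOpCost,
                                       long ReverseShuffleCost,
                                       int ConsecutiveStride) {
    bool IsReverse = ConsecutiveStride < 0;
    return WideMemOpCost + (IsReverse ? ReverseShuffleCost : 0);
  };
  (void)SketchConsecutiveMemOpCost;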
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  if (isa<LoadInst>(I)) {
         (IsLoopInvariantStoreValue

LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));

LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
  assert(Group && "Fail to get an interleaved access group.");
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned InterleaveFactor = Group->getFactor();

  // Holds the indices of existing members in the interleaved group.
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
      InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *

std::optional<InstructionCost>
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // to be used.

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
                         : dyn_cast<Instruction>(RetI->getOperand(1));

  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);

        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;

    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
    bool IsUnsigned = isa<ZExtInst>(Op0);

    // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
    // different sizes. We take the largest type as the ext to reduce, and add
    // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
    if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
      Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
      return I == RetI ? RedCost : 0;

    // Matched reduce.add(mul())
    if (RedCost.isValid() && RedCost < MulCost + BaseCost)
      return I == RetI ? RedCost : 0;

  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
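// Illustrative sketch (not part of the pass): for a matched
// reduce.add(ext(mul(ext(A), ext(B)))) the decision above is a plain cost
// comparison between the fused reduction and the sum of its components
// (two input extends, the multiply, any extra extend, and the base reduction
// it replaces). Plain integer costs stand in for TTI results.
auto SketchUseFusedReduction = [](long FusedRedCost, long ExtCost,
                                  long MulCost, long Ext2Cost,
                                  long BaseRedCost) {
  return FusedRedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseRedCost;
};
(void)SketchUseFusedReduction;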
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // point.

LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (!RetTy->isVoidTy() &&

  // Some targets keep addresses scalar.

  // Some targets support efficient element stores.

  // Collect operands to consider.

  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
  for (auto *V : filterExtractingOperands(Ops, VF))
      filterExtractingOperands(Ops, VF), Tys, CostKind);

  // For each instruction in the old loop.
      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
        auto IsLegalToScalarize = [&]() {
          // Scalarization of fixed length vectors "just works".

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
            IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)

        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)

      // We assume that widening is the best solution when possible.
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned NumAccesses = 1;
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
          NumAccesses = Group->getNumMembers();

        InterleaveCost = getInterleaveGroupCost(&I, VF);
              ? getGatherScatterCost(&I, VF) * NumAccesses
              getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF,
      // write down this decision and use it during vectorization.
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Cost = GatherScatterCost;
        Cost = ScalarizationCost;
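      // Illustrative sketch (not part of the pass): the widening decision
      // above is a three-way minimum with ties broken in favour of
      // interleaving. Costs are plain integers here; the enum and names are
      // hypothetical.
      enum class SketchMemDecision { Interleave, GatherScatter, Scalarize };
      auto SketchPickMemWidening = [](long InterleaveCost,
                                      long GatherScatterCost,
                                      long ScalarizationCost) {
        if (InterleaveCost <= GatherScatterCost &&
            InterleaveCost < ScalarizationCost)
          return SketchMemDecision::Interleave;
        if (GatherScatterCost < ScalarizationCost)
          return SketchMemDecision::GatherScatter;
        return SketchMemDecision::Scalarize;
      };
      (void)SketchPickMemWidening;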
      // If the instructions belong to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.

  // Start with all scalar pointer uses.

  // Add all instructions used to generate the addresses.
  while (!Worklist.empty()) {
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.

      // Scalarize a widened load of address.

      // Scalarize an interleave group of address loads.
      for (unsigned I = 0; I < Group->getFactor(); ++I) {

    // Make sure I gets scalarized and a cost estimate without
    // scalarization overhead.
           "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(VF);
  // For each instruction in the old loop.
        for (auto &ArgOp : CI->args())

        // Estimate cost of scalarized vector call. The source operands are
        // assumed to be vectors, so we need to extract individual elements from
        // there, execute VF scalar calls, and then gather the result into the
        // vector return value.

        // Compute costs of unpacking argument values for the scalar calls and
        // packing the return values to a vector.

        // Honor ForcedScalars and UniformAfterVectorization decisions.
        // TODO: For calls, it might still be more profitable to widen. Use
        // VPlan-based cost model to compare different options.
        if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                               ForcedScalar->second.contains(CI)) ||

        // Compute corresponding vector type for return value and arguments.
        for (Type *ScalarTy : ScalarTys)

        // An in-loop reduction using an fmuladd intrinsic is a special case;
        // we don't want the normal cost for that intrinsic.
                                std::nullopt, *RedCost);

        // Find the cost of vectorizing the call, if we can find a suitable
        // vector variant of the function.
        bool UsesMask = false;

          // Search through any available variants for one we can use at this VF.
            // Must match requested VF.
            if (Info.Shape.VF != VF)

            // Must take a mask argument if one is required.
            if (MaskRequired && !Info.isMasked())

            // Check that all parameter kinds are supported.
              switch (Param.ParamKind) {
                // Make sure the scalar parameter in the loop is invariant.

                // Find the stride for the scalar parameter in this loop and see
                // if it matches the stride for the variant.
                // TODO: do we need to figure out the cost of an extract to get
                // the first lane? Or do we hope that it will be folded away?
                    dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
                if (!SAR || SAR->getLoop() != TheLoop) {
                    dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));

            // Found a suitable candidate, stop here.

          // Add in the cost of synthesizing a mask if one wasn't required.
          if (VecFunc && UsesMask && !MaskRequired)

        // Find the cost of an intrinsic; some targets may have instructions
        // that perform the operation without needing an actual call.
        if (VectorCost <= Cost) {
        if (IntrinsicCost <= Cost) {
          Cost = IntrinsicCost;
6376// Consider Op invariant, if it or its operands aren't predicated 6377// instruction in the loop. In that case, it is not trivially hoistable. 6378auto *OpI = dyn_cast<Instruction>(
Op);
6389// If we know that this instruction will remain uniform, check the cost of 6390// the scalar version. 6395return InstsToScalarize[VF][
I];
6397// Forced scalars do not have any scalarization overhead. 6398auto ForcedScalar = ForcedScalars.
find(VF);
6399if (VF.
isVector() && ForcedScalar != ForcedScalars.
end()) {
6400auto InstSet = ForcedScalar->second;
6401if (InstSet.count(
I))
6411auto HasSingleCopyAfterVectorization = [
this](
Instruction *
I,
6416auto Scalarized = InstsToScalarize.
find(VF);
6417assert(Scalarized != InstsToScalarize.
end() &&
6418"VF not yet analyzed for scalarization profitability");
6419return !Scalarized->second.count(
I) &&
6421 auto *UI = cast<Instruction>(U);
6422 return !Scalarized->second.count(UI);
6425 (void)HasSingleCopyAfterVectorization;
6429// With the exception of GEPs and PHIs, after scalarization there should 6430// only be one copy of the instruction generated in the loop. This is 6431// because the VF is either 1, or any instructions that need scalarizing 6432// have already been dealt with by the time we get here. As a result, 6433// it means we don't have to multiply the instruction cost by VF. 6434assert(
I->getOpcode() == Instruction::GetElementPtr ||
6435I->getOpcode() == Instruction::PHI ||
6436 (
I->getOpcode() == Instruction::BitCast &&
6437I->getType()->isPointerTy()) ||
6438 HasSingleCopyAfterVectorization(
I, VF));
6447// TODO: We need to estimate the cost of intrinsic calls. 6448switch (
I->getOpcode()) {
6449case Instruction::GetElementPtr:
6450// We mark this instruction as zero-cost because the cost of GEPs in 6451// vectorized code depends on whether the corresponding memory instruction 6452// is scalarized or not. Therefore, we handle GEPs with the memory 6455case Instruction::Br: {
6456// In cases of scalarized and predicated instructions, there will be VF 6457// predicated blocks in the vectorized loop. Each branch around these 6458// blocks requires also an extract of its vector compare i1 element. 6459// Note that the conditional branch from the loop latch will be replaced by 6460// a single branch controlling the loop, so there is no extra overhead from 6462bool ScalarPredicatedBB =
false;
6468 ScalarPredicatedBB =
true;
6470if (ScalarPredicatedBB) {
6471// Not possible to scalarize scalable vector with predicated instructions. 6474// Return cost for branches around scalarized and predicated blocks. 6480/*Insert*/false,
/*Extract*/true,
CostKind) +
6485// The back-edge branch will remain, as will all scalar branches. 6488// This branch will be eliminated by if-conversion. 6490// Note: We currently assume zero cost for an unconditional branch inside 6491// a predicated block since it will become a fall-through, although we 6492// may decide in the future to call TTI for all branches. 6494case Instruction::Switch: {
6497auto *Switch = cast<SwitchInst>(
I);
6498return Switch->getNumCases() *
6501toVectorTy(Switch->getCondition()->getType(), VF),
6505case Instruction::PHI: {
6506auto *Phi = cast<PHINode>(
I);
6508// First-order recurrences are replaced by vector shuffles inside the loop. 6510// For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6511// penultimate value of the recurrence. 6512// TODO: Consider vscale_range info. 6518 cast<VectorType>(VectorTy), Mask,
CostKind,
6522// Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6523// converted into select instructions. We require N - 1 selects per phi 6524// node, where N is the number of incoming values. 6526Type *ResultTy = Phi->getType();
6528// All instructions in an Any-of reduction chain are narrowed to bool. 6529// Check if that is the case for this phi node. 6530auto *HeaderUser = cast_if_present<PHINode>(
6531 find_singleton<User>(Phi->users(), [
this](
User *U,
bool) ->
User * {
6532 auto *Phi = dyn_cast<PHINode>(U);
6533 if (Phi && Phi->getParent() == TheLoop->getHeader())
6539auto Iter = ReductionVars.
find(HeaderUser);
6540if (Iter != ReductionVars.end() &&
6542 Iter->second.getRecurrenceKind()))
6545return (Phi->getNumIncomingValues() - 1) *
6547 Instruction::Select,
toVectorTy(ResultTy, VF),
6552// When tail folding with EVL, if the phi is part of an out of loop 6553// reduction then it will be transformed into a wide vp_merge. 6557 Intrinsic::vp_merge,
toVectorTy(Phi->getType(), VF),
6558 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6564case Instruction::UDiv:
6565case Instruction::SDiv:
6566case Instruction::URem:
6567case Instruction::SRem:
6571 ScalarCost : SafeDivisorCost;
6573// We've proven all lanes safe to speculate, fall through. 6575case Instruction::Add:
6576case Instruction::Sub: {
6580// Assume that a non-constant update value (or a constant != 1) requires 6581// a multiply, and add that into the cost. 6584if (!
RHS ||
RHS->getZExtValue() != 1)
6588// Find the cost of the histogram operation itself. 6590Type *ScalarTy =
I->getType();
6594 {PtrTy, ScalarTy, MaskTy});
6596// Add the costs together with the add/sub operation. 6602case Instruction::FAdd:
6603case Instruction::FSub:
6604case Instruction::Mul:
6605case Instruction::FMul:
6606case Instruction::FDiv:
6607case Instruction::FRem:
6608case Instruction::Shl:
6609case Instruction::LShr:
6610case Instruction::AShr:
6611case Instruction::And:
6612case Instruction::Or:
6613case Instruction::Xor: {
6614// If we're speculating on the stride being 1, the multiplication may 6615// fold away. We can generalize this for all operations using the notion 6616// of neutral elements. (TODO) 6617if (
I->getOpcode() == Instruction::Mul &&
6622// Detect reduction patterns 6626// Certain instructions can be cheaper to vectorize if they have a constant 6627// second vector operand. One example of this are shifts on x86. 6628Value *Op2 =
I->getOperand(1);
6631 Op2 = cast<SCEVConstant>(
PSE.
getSCEV(Op2))->getValue();
6641 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6644case Instruction::FNeg: {
6647 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6648 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6651case Instruction::Select: {
6653constSCEV *CondSCEV = SE->
getSCEV(SI->getCondition());
6656constValue *Op0, *Op1;
6660// select x, y, false --> x & y 6661// select x, true, y --> x | y 6673Type *CondTy = SI->getCondition()->getType();
6678if (
auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6679 Pred = Cmp->getPredicate();
6681CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6682 {TTI::OK_AnyValue, TTI::OP_None},
I);
6684case Instruction::ICmp:
6685case Instruction::FCmp: {
6686Type *ValTy =
I->getOperand(0)->getType();
6689Instruction *Op0AsInstruction = dyn_cast<Instruction>(
I->getOperand(0));
6690 (void)Op0AsInstruction;
6692 MinBWs[
I] == MinBWs[Op0AsInstruction]) &&
6693"if both the operand and the compare are marked for " 6694"truncation, they must have the same bitwidth");
6700 cast<CmpInst>(
I)->getPredicate(),
CostKind,
6701 {TTI::OK_AnyValue, TTI::OP_None},
6702 {TTI::OK_AnyValue, TTI::OP_None},
I);
6704case Instruction::Store:
6705case Instruction::Load: {
6710"CM decision should be taken at this point");
6717return getMemoryInstructionCost(
I, VF);
6719case Instruction::BitCast:
6720if (
I->getType()->isPointerTy())
6723case Instruction::ZExt:
6724case Instruction::SExt:
6725case Instruction::FPToUI:
6726case Instruction::FPToSI:
6727case Instruction::FPExt:
6728case Instruction::PtrToInt:
6729case Instruction::IntToPtr:
6730case Instruction::SIToFP:
6731case Instruction::UIToFP:
6732case Instruction::Trunc:
6733case Instruction::FPTrunc: {
6734// Computes the CastContextHint from a Load/Store instruction. 6736assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
6737"Expected a load or a store!");
6763unsigned Opcode =
I->getOpcode();
6765// For Trunc, the context is the only user, which must be a StoreInst. 6766if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6768if (
StoreInst *Store = dyn_cast<StoreInst>(*
I->user_begin()))
6769 CCH = ComputeCCH(Store);
6771// For Z/Sext, the context is the operand, which must be a LoadInst. 6772elseif (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6773 Opcode == Instruction::FPExt) {
6774if (
LoadInst *Load = dyn_cast<LoadInst>(
I->getOperand(0)))
6775 CCH = ComputeCCH(Load);
6778// We optimize the truncation of induction variables having constant 6779// integer steps. The cost of these truncations is the same as the scalar 6782auto *Trunc = cast<TruncInst>(
I);
6784 Trunc->getSrcTy(), CCH,
CostKind, Trunc);
6787// Detect reduction patterns 6791Type *SrcScalarTy =
I->getOperand(0)->getType();
6792Instruction *Op0AsInstruction = dyn_cast<Instruction>(
I->getOperand(0));
6800// If the result type is <= the source type, there will be no extend 6801// after truncating the users to the minimal required bitwidth. 6803 (
I->getOpcode() == Instruction::ZExt ||
6804I->getOpcode() == Instruction::SExt))
6810case Instruction::Call:
6812case Instruction::ExtractValue:
6814case Instruction::Alloca:
6815// We cannot easily widen alloca to a scalable alloca, as 6816// the result would need to be a vector of pointers. 6821// This opcode is unknown. Assume that it is the same as 'mul'. 6827// Ignore ephemeral values. 6833// If a scalar epilogue is required, users outside the loop won't use 6834// live-outs from the vector loop but from the scalar epilogue. Ignore them if 6837auto IsLiveOutDead = [
this, RequiresScalarEpilogue](
User *U) {
6838return RequiresScalarEpilogue &&
6847// Find all stores to invariant variables. Since they are going to sink 6848// outside the loop we do not need calculate cost for them. 6850if ((SI = dyn_cast<StoreInst>(&
I)) &&
6853 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6854 SI->getValueOperand());
6860// Add instructions that would be trivially dead and are only used by 6861// values already ignored to DeadOps to seed worklist. 6864 return VecValuesToIgnore.contains(U) ||
6865 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6869// For interleave groups, we only create a pointer for the start of the 6870// interleave group. Queue up addresses of group members except the insert 6871// position for further processing. 6874if (Group->getInsertPos() == &
I)
6877 DeadInterleavePointerOps.
push_back(PointerOp);
6880// Queue branches for analysis. They are dead, if their successors only 6881// contain dead instructions. 6882if (
auto *Br = dyn_cast<BranchInst>(&
I)) {
6883if (Br->isConditional())
6888// Mark ops feeding interleave group members as free, if they are only used 6889// by other dead computations. 6890for (
unsignedI = 0;
I != DeadInterleavePointerOps.
size(); ++
I) {
6891auto *
Op = dyn_cast<Instruction>(DeadInterleavePointerOps[
I]);
6893 Instruction *UI = cast<Instruction>(U);
6894 return !VecValuesToIgnore.contains(U) &&
6895 (!isAccessInterleaved(UI) ||
6896 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6900 DeadInterleavePointerOps.
append(
Op->op_begin(),
Op->op_end());
6903for (
constauto &[
_, Ops] : DeadInvariantStoreOps) {
6907// Mark ops that would be trivially dead and are only used by ignored 6908// instructions as free. 6911// Returns true if the block contains only dead instructions. Such blocks will 6912// be removed by VPlan-to-VPlan transforms and won't be considered by the 6913// VPlan-based cost model, so skip them in the legacy cost-model as well. 6917 (isa<BranchInst>(&
I) && !cast<BranchInst>(&
I)->isConditional());
6920for (
unsignedI = 0;
I != DeadOps.
size(); ++
I) {
6921auto *
Op = dyn_cast<Instruction>(DeadOps[
I]);
6923// Check if the branch should be considered dead. 6924if (
auto *Br = dyn_cast_or_null<BranchInst>(
Op)) {
6927// Don't considers branches leaving the loop for simplification. 6932if ((ThenEmpty && ElseEmpty) ||
6934 ElseBB->
phis().empty()) ||
6936 ThenBB->
phis().empty())) {
6943// Skip any op that shouldn't be considered dead. 6945 (isa<PHINode>(
Op) &&
Op->getParent() == Header) ||
6948 return !VecValuesToIgnore.contains(U) &&
6949 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6956// If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore 6957// which applies for both scalar and vector versions. Otherwise it is only 6958// dead in vector versions, so only add it to VecValuesToIgnore. 6960 [
this](
User *U) { return ValuesToIgnore.contains(U); }))
6967// Ignore type-promoting instructions we identified during reduction 6974// Ignore type-casting instructions we identified during induction 6988// We don't collect reductions that are type promoted (yet). 6992// If the target would prefer this reduction to happen "in-loop", then we 6993// want to record it as such. 7000// Check that we can correctly put the reductions into the loop, by 7001// finding the chain of operations that leads from the phi to the loop 7005bool InLoop = !ReductionOperations.
empty();
7008 InLoopReductions.
insert(Phi);
7009// Add the elements to InLoopReductionImmediateChains for cost modelling. 7011for (
auto *
I : ReductionOperations) {
7012 InLoopReductionImmediateChains[
I] = LastChain;
7016LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
7017 <<
" reduction for phi: " << *Phi <<
"\n");
7021// This function will select a scalable VF if the target supports scalable 7022// vectors and a fixed one otherwise. 7023// TODO: we could return a pair of values that specify the max VF and 7024// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7025// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7026// doesn't have a cost model that can choose which plan to execute if 7027// more than one is generated. 7039unsignedN =
RegSize.getKnownMinValue() / WidestType;
7046// Outer loop handling: They may require CFG and instruction level 7047// transformations before even evaluating whether vectorization is profitable. 7048// Since we cannot modify the incoming IR, we need to build VPlan upfront in 7049// the vectorization pipeline. 7051// If the user doesn't provide a vectorization factor, determine a 7057// Make sure we have a VF > 1 for stress testing. 7060 <<
"overriding computed VF.\n");
7065LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing. Scalable VF requested, but " 7066 <<
"not supported by the target.\n");
7068"Scalable vectorization requested but not supported by the target",
7069"the scalable user-specified vectorization width for outer-loop " 7070"vectorization cannot be used because the target does not support " 7072"ScalableVFUnfeasible", ORE, OrigLoop);
7077"VF needs to be a power of two");
7079 <<
"VF " << VF <<
" to build VPlans.\n");
7082// For VPlan build stress testing, we bail out after VPlan construction. 7086return {VF, 0
/*Cost*/, 0
/* ScalarCost */};
7090dbgs() <<
"LV: Not vectorizing. Inner loops aren't supported in the " 7091"VPlan-native path.\n");
7101if (!MaxFactors)
// Cases that should not to be vectorized nor interleaved. 7104// Invalidate interleave groups if all blocks of loop will be predicated. 7109 <<
"LV: Invalidate all interleaved groups due to fold-tail by masking " 7110"which requires masked-interleaved support.\n");
7112// Invalidating interleave groups also requires invalidating all decisions 7113// based on them, which includes widening decisions and uniform and scalar 7126"UserVF ignored because it may be larger than the maximal safe VF",
7127"InvalidUserVF", ORE, OrigLoop);
7130"VF needs to be a power of two");
7131// Collect the instructions (and their associated costs) that will be more 7132// profitable to scalarize. 7136 buildVPlansWithVPRecipes(UserVF, UserVF);
7141"InvalidCost", ORE, OrigLoop);
7145// Collect the Vectorization Factor Candidates. 7155for (
constauto &VF : VFCandidates) {
7156// Collect Uniform and Scalar instructions after vectorization with VF. 7159// Collect the instructions (and their associated costs) that will be more 7160// profitable to scalarize. 7188// Cost modeling for inductions is inaccurate in the legacy cost model 7189// compared to the recipes that are generated. To match here initially during 7190// VPlan cost model bring up directly use the induction costs from the legacy 7191// cost model. Note that we do this as pre-processing; the VPlan may not have 7192// any recipes associated with the original induction increment instruction 7193// and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7194// the cost of induction phis and increments (both that are represented by 7195// recipes and those that are not), to avoid distinguishing between them here, 7196// and skip all recipes that represent induction phis and increments (the 7197// former case) later on, if they exist, to avoid counting them twice. 7198// Similarly we pre-compute the cost of any optimized truncates. 7199// TODO: Switch to more accurate costing based on VPlan. 7204for (
unsignedI = 0;
I != IVInsts.
size();
I++) {
7205for (
Value *
Op : IVInsts[
I]->operands()) {
7206auto *OpI = dyn_cast<Instruction>(
Op);
7207if (
Op ==
IV || !OpI || !OrigLoop->
contains(OpI) || !
Op->hasOneUse())
7213for (
User *U :
IV->users()) {
7214auto *CI = cast<Instruction>(U);
7220// If the vector loop gets executed exactly once with the given VF, ignore 7221// the costs of comparison and induction instructions, as they'll get 7223// TODO: Remove this code after stepping away from the legacy cost model and 7224// adding code to simplify VPlans before calculating their costs. 7235dbgs() <<
"Cost of " << InductionCost <<
" for VF " << VF
7236 <<
": induction instruction " << *IVInst <<
"\n";
7238Cost += InductionCost;
7243 /// Compute the cost of all exiting conditions of the loop using the legacy 7244 /// cost model. This is to match the legacy behavior, which adds the cost of 7245 /// all exit conditions. Note that this over-estimates the cost, as there will 7246 /// be a single condition to control the vector loop. 7250// Collect all exit conditions. 7252auto *
Term = dyn_cast<BranchInst>(EB->getTerminator());
7255if (
auto *CondI = dyn_cast<Instruction>(
Term->getOperand(0))) {
7256 ExitInstrs.
insert(CondI);
7259// Compute the cost of all instructions only feeding the exit conditions. 7260for (
unsignedI = 0;
I != ExitInstrs.
size(); ++
I) {
7267dbgs() <<
"Cost of " << CondICost <<
" for VF " << VF
7268 <<
": exit condition instruction " << *CondI <<
"\n";
7272auto *OpI = dyn_cast<Instruction>(
Op);
7273if (!OpI ||
any_of(OpI->users(), [&ExitInstrs,
this](
User *U) {
7274 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7275 !ExitInstrs.contains(cast<Instruction>(U));
7282// The legacy cost model has special logic to compute the cost of in-loop 7283// reductions, which may be smaller than the sum of all instructions involved 7285// TODO: Switch to costing based on VPlan once the logic has been ported. 7293constauto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
  return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
};
// Also include the operands of instructions in the chain, as the cost-model
// may mark extends as free.
//
// For ARM, some of the instructions can be folded into the reduction
// instruction. So we need to mark all folded instructions as free.
// For example: we can fold reduce(mul(ext(A), ext(B))) into one instruction.
for (auto *ChainOp : ChainOps) {
  for (Value *Op : ChainOp->operands()) {
    if (auto *I = dyn_cast<Instruction>(Op)) {
      ChainOpsAndOperands.insert(I);
      if (I->getOpcode() == Instruction::Mul) {
        auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
        auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
        if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
            Ext0->getOpcode() == Ext1->getOpcode()) {
          ChainOpsAndOperands.insert(Ext0);
          ChainOpsAndOperands.insert(Ext1);

// Pre-compute the cost for I, if it has a reduction pattern cost.
"reduction op visited multiple times");
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                  << ":\n in-loop reduction " << *I << "\n");
Cost += *ReductionCost;

// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
// TODO: Compute cost of branches for each replicate region in the VPlan,
// which is more accurate than the legacy cost model.
7339// Pre-compute the costs for branches except for the backedge, as the number 7340// of replicate regions in a VPlan may not directly match the number of 7341// branches, which would lead to different decisions. 7342// TODO: Compute cost of branches for each replicate region in the VPlan, 7343// which is more accurate than the legacy cost model. 7350auto BranchCost = CostCtx.
getLegacyCost(BB->getTerminator(), VF);
7354// Pre-compute costs for instructions that are forced-scalar or profitable to 7355// scalarize. Their costs will be computed separately in the legacy cost 7357for (
Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7363dbgs() <<
"Cost of " << ForcedCost <<
" for VF " << VF
7364 <<
": forced scalar " << *ForcedScalar <<
"\n";
for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
  dbgs() << "Cost of " << ScalarCost << " for VF " << VF
         << ": profitable to scalarize " << *Scalarized << "\n";

// Now compute and add the VPlan-based cost.
<< " (Estimated cost per lane: ");
double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
}
else /* No point dividing an invalid cost - it will still be invalid */

/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.

// First collect all instructions for the recipes in Plan.
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
  return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
  return &WidenMem->getIngredient();
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
  if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
    auto *IG = IR->getInterleaveGroup();
    unsigned NumMembers = IG->getNumMembers();
    for (unsigned I = 0; I != NumMembers; ++I) {
// The VPlan-based cost model is more accurate for partial reductions and
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
7443// Return true if the loop contains any instructions that are not also part of 7444// the VPlan or are skipped for VPlan-based cost computations. This indicates 7445// that the VPlan contains extra simplifications. 7448 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7449 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7451 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7460// If there is a single VPlan with a single VF, return it directly. 7461VPlan &FirstPlan = *VPlans[0];
7467 ?
"Reciprocal Throughput\n" 7469 ?
"Instruction Latency\n" 7472 ?
"Code Size and Latency\n" 7477"More than a single plan/VF w/o any plan having scalar VF");
7479// TODO: Compute scalar cost using VPlan-based cost model. 7481LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << ScalarCost <<
".\n");
7486if (ForceVectorization) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// comparison.
for (auto &P : VPlans) {
  << "LV: Not considering vector loop of width " << VF
  << " because it will not generate any vector instructions.\n");

  if (isMoreProfitable(CurrentFactor, BestFactor))
    BestFactor = CurrentFactor;

  // If profitable, add it to the ProfitableVFs list.
  if (isMoreProfitable(CurrentFactor, ScalarFactor))
    ProfitableVFs.push_back(CurrentFactor);
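// A simplified sketch of how a "more profitable" comparison between
// (cost, width) candidates can be made without dividing, by cross-multiplying
// cost and width. It deliberately ignores the real isMoreProfitable's extra
// handling (scalable VFs, tail folding, tie breaks) and only illustrates the
// selection loop above; all names here are illustrative.
#include <cstdint>
#include <vector>

struct CandidateFactor {
  uint64_t Cost;  // estimated cost of one vector iteration
  unsigned Width; // number of scalar iterations covered (VF)
};

static bool isCheaperPerLane(const CandidateFactor &A,
                             const CandidateFactor &B) {
  // A beats B if Cost(A)/Width(A) < Cost(B)/Width(B), compared without
  // division to avoid rounding.
  return A.Cost * B.Width < B.Cost * A.Width;
}

static CandidateFactor pickBest(const std::vector<CandidateFactor> &Cands,
                                CandidateFactor Scalar) {
  CandidateFactor Best = Scalar;
  for (const CandidateFactor &C : Cands)
    if (isCheaperPerLane(C, Best))
      Best = C;
  return Best;
}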
7517// Select the optimal vectorization factor according to the legacy cost-model. 7518// This is now only used to verify the decisions by the new VPlan-based 7519// cost-model and will be retired once the VPlan-based cost-model is 7524// Pre-compute the cost and use it to check if BestPlan contains any 7525// simplifications not accounted for in the legacy cost model. If that's the 7526// case, don't trigger the assertion, as the extra simplifications may cause a 7527// different VF to be picked by the VPlan-based cost model. 7530 precomputeCosts(BestPlan, BestFactor.
Width, CostCtx);
7533 CostCtx, OrigLoop) ||
7535 CostCtx, OrigLoop)) &&
7536" VPlan cost model and legacy cost model disagreed");
7538"when vectorizing, the scalar cost must be computed.");
7547// Reserve first location for self reference to the LoopID metadata node. 7549bool IsUnrollMetadata =
false;
7550MDNode *LoopID = L->getLoopID();
7552// First find existing loop unrolling disable metadata. 7554auto *MD = dyn_cast<MDNode>(LoopID->
getOperand(
I));
7556constauto *S = dyn_cast<MDString>(MD->getOperand(0));
7558 S && S->getString().starts_with(
"llvm.loop.unroll.disable");
7564if (!IsUnrollMetadata) {
7565// Add runtime unroll disable metadata. 7566LLVMContext &Context = L->getHeader()->getContext();
7573// Set operand 0 to refer to the loop id itself. 7575 L->setLoopID(NewLoopID);
7579// If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7580// fix the reduction's scalar PHI node by adding the incoming value from the 7585auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7590auto *EpiRedHeaderPhi =
7591 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7593 EpiRedHeaderPhi->getRecurrenceDescriptor();
7594Value *MainResumeValue =
7595 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7598auto *Cmp = cast<ICmpInst>(MainResumeValue);
7600"AnyOf expected to start with ICMP_NE");
7602"AnyOf expected to start by comparing main resume value to original " 7604 MainResumeValue = Cmp->getOperand(0);
7608Value *Cmp, *OrigResumeV;
7609bool IsExpectedPattern =
7616assert(IsExpectedPattern &&
"Unexpected reduction resume pattern");
7617 (void)IsExpectedPattern;
7618 MainResumeValue = OrigResumeV;
7620PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7622// When fixing reductions in the epilogue loop we should already have 7623// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7624// over the incoming values correctly. 7625using namespaceVPlanPatternMatch;
7626auto IsResumePhi = [](
VPUser *U) {
7628 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7631"ResumePhi must have a single user");
7632auto *EpiResumePhiVPI =
7633 cast<VPInstruction>(*
find_if(EpiRedResult->users(), IsResumePhi));
7634auto *EpiResumePhi = cast<PHINode>(State.
get(EpiResumePhiVPI,
true));
7635 EpiResumePhi->setIncomingValueForBlock(
7644"Trying to execute plan with unsupported VF");
7646"Trying to execute plan with unsupported UF");
7648 ((VectorizingEpilogue && ExpandedSCEVs) ||
7649 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7650"expanded SCEVs to reuse can only be used during epilogue vectorization");
7652// TODO: Move to VPlan transform stage once the transition to the VPlan-based 7653// cost model is complete for better cost estimates. 7659// Perform the actual loop transformation. 7664#ifdef EXPENSIVE_CHECKS 7665assert(DT->
verify(DominatorTree::VerificationLevel::Fast));
7668// 0. Generate SCEV-dependent code in the entry, including TripCount, before 7669// making any changes to the CFG. 7676assert(VectorizingEpilogue &&
"should only re-use the existing trip " 7677"count during epilogue vectorization");
7679// 1. Set up the skeleton for vectorization, including vector pre-header and 7680// middle block. The vector loop is created during VPlan execution. 7685if (VectorizingEpilogue)
7688// Only use noalias metadata when using memory checks guaranteeing no overlap 7689// across all iterations. 7691 std::unique_ptr<LoopVersioning> LVer =
nullptr;
7695// We currently don't use LoopVersioning for the actual loop cloning but we 7696// still use it to add the noalias metadata. 7697// TODO: Find a better way to re-use LoopVersioning functionality to add 7699 LVer = std::make_unique<LoopVersioning>(
7702 State.
LVer = &*LVer;
7708//===------------------------------------------------===// 7710// Notice: any optimization or new instruction that go 7711// into the code below should also be implemented in 7714//===------------------------------------------------===// 7716// 2. Copy and widen instructions from the old loop into the new loop. 7725// 2.5 When vectorizing the epilogue, fix reduction and induction resume 7726// values from the additional bypass block. 7727if (VectorizingEpilogue) {
7729"Epilogue vectorisation not yet supported with early exits");
7733 &R, State, State.
CFG.
VPBB2IRBB[MiddleVPBB], BypassBlock);
7737auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7739 Inc->setIncomingValueForBlock(BypassBlock, V);
7743// 2.6. Maintain Loop Hints 7744// Keep all loop hints from the original loop on the vector loop (we'll 7745// replace the vectorizer-specific hints below). 7749 std::optional<MDNode *> VectorizedLoopID =
7755if (VectorizedLoopID) {
7756 L->setLoopID(*VectorizedLoopID);
7758// Keep all loop hints from the original loop on the vector loop (we'll 7759// replace the vectorizer-specific hints below). 7772// 3. Fix the vectorized code: take care of header phi's, live-outs, 7773// predication, updating analyses. 7778// 4. Adjust branch weight of the branch in the middle block. 7782 cast<BranchInst>(State.
CFG.
VPBB2IRBB[MiddleVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
// Assume that `Count % VectorTripCount` is equally distributed.
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
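// Worked example for the weight computation above (illustrative only): with
// a vector step of VF * UF = 8 and `Count % 8` assumed uniform over [0, 8),
// exactly one of the eight possible remainder values selects one side of the
// middle-block branch, giving branch weights {1, 7}.
#include <array>
#include <cstdint>

static std::array<uint32_t, 2> middleBlockWeights(unsigned VF, unsigned UF) {
  unsigned Step = VF * UF; // e.g. VF = 4, UF = 2 -> Step = 8
  return {1, Step - 1};    // -> {1, 7}
}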
7796//===--------------------------------------------------------------------===// 7797// EpilogueVectorizerMainLoop 7798//===--------------------------------------------------------------------===// 7800/// This function is partially responsible for generating the control flow 7801/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7803const SCEV2ValueTy &ExpandedSCEVs) {
7806// Generate the code to check the minimum iteration count of the vector 7807// epilogue (see below). 7812// Generate the code to check any assumptions that we've made for SCEV 7816// Generate the code that checks at runtime if arrays overlap. We put the 7817// checks into a separate block to make the more common case of few elements 7821// Generate the iteration count check for the main loop, *after* the check 7822// for the epilogue loop, so that the path-length is shorter for the case 7823// that goes directly through the vector epilogue. The longer-path length for 7824// the main loop is compensated for, by the gain from vectorizing the larger 7825// trip count. Note: the branch will get updated later on when we vectorize 7830// Generate the induction variable. 7838dbgs() <<
"Create Skeleton for epilogue vectorized loop (first pass)\n" 7848dbgs() <<
"intermediate fn:\n" 7856assert(Bypass &&
"Expected valid bypass basic block.");
7860// Reuse existing vector loop preheader for TC checks. 7861// Note that new preheader block is generated for vector loop. 7865// Generate code to check if the loop's trip count is less than VF * UF of the 7877 TCCheckBlock->
setName(
"vector.main.loop.iter.check");
7879// Create new preheader for vector loop. 7881DT,
LI,
nullptr,
"vector.ph");
7886"TC check is expected to dominate Bypass");
7890// Save the trip count so we don't have to regenerate it in the 7891// vec.epilog.iter.check. This is safe to do because the trip count 7892// generated here dominates the vector epilog iter check. 7906//===--------------------------------------------------------------------===// 7907// EpilogueVectorizerEpilogueLoop 7908//===--------------------------------------------------------------------===// 7910/// This function is partially responsible for generating the control flow 7911/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7914const SCEV2ValueTy &ExpandedSCEVs) {
7917// Now, compare the remaining count and if there aren't enough iterations to 7918// execute the vectorized epilogue skip to the scalar part. 7922nullptr,
"vec.epilog.iter.check",
true);
7924 VecEpilogueIterationCountCheck);
7927// Adjust the control flow taking the state info from the main loop 7928// vectorization into account. 7930"expected this to be saved from the previous pass.");
7946// Keep track of bypass blocks, as they feed start values to the induction and 7947// reduction phis in the scalar loop preheader. 7954// The vec.epilog.iter.check block may contain Phi nodes from inductions or 7955// reductions which merge control-flow from the latch block and the middle 7956// block. Update the incoming values here and move the Phi into the preheader. 7958for (
PHINode &Phi : VecEpilogueIterationCountCheck->
phis())
7961for (
PHINode *Phi : PhisInBlock) {
7963 Phi->replaceIncomingBlockWith(
7965 VecEpilogueIterationCountCheck);
7967// If the phi doesn't have an incoming value from the 7968// EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7969// value and also those from other check blocks. This is needed for 7970// reduction phis only. 7972 return EPI.EpilogueIterationCountCheck == IncB;
7982// Generate bypass values from the additional bypass block. Note that when the 7983// vectorized epilogue is skipped due to iteration count check, then the 7984// resume value for the induction variable comes from the trip count of the 7985// main vector loop, passed as the second argument. 7995"Expected trip count to have been saved in the first pass.");
7999"saved trip count does not dominate insertion point.");
8004// Generate code to check if the loop's trip count is less than VF * UF of the 8005// vector epilogue loop. 8010Value *CheckMinIters =
8014"min.epilog.iters.check");
8020unsigned EpilogueLoopStep =
8022// We assume the remaining `Count` is equally distributed in 8024// So the probability for `Count < EpilogueLoopStep` should be 8025// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8026unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8027constuint32_t Weights[] = {EstimatedSkipCount,
8028 MainLoopStep - EstimatedSkipCount};
8034// A new entry block has been created for the epilogue VPlan. Hook it in, as 8035// otherwise we would try to modify the entry to the main vector loop. 8040// OldEntry is now dead and will be cleaned up when the plan gets destroyed. 8048dbgs() <<
"Create Skeleton for epilogue vectorized loop (second pass)\n" 8063return getVPValueOrAddLiveIn(
Op);
8075"unsupported switch either exiting loop or continuing to header");
8076// Create masks where the terminator in Src is a switch. We create mask for 8077// all edges at the same time. This is more efficient, as we can create and 8078// collect compares for all cases once. 8080BasicBlock *DefaultDst = SI->getDefaultDest();
for (auto &C : SI->cases()) {
  assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
  // Cases whose destination is the same as default are redundant and can be
  // ignored - they will get there anyhow.
  if (Dst == DefaultDst)
  auto &Compares = Dst2Compares[Dst];

// We need to handle 2 separate cases below for all entries in Dst2Compares,
// which excludes destinations matching the default destination.
for (const auto &[Dst, Conds] : Dst2Compares) {
  // 1. Dst is not the default destination. Dst is reached if any of the cases
  // with destination == Dst are taken. Join the conditions for each case
  // whose destination == Dst using an OR.
  EdgeMaskCache[{Src, Dst}] = Mask;

  // 2. Create the mask for the default destination, which is reached if none
  // of the cases with destination != default destination are taken. Join the
  // conditions for each case where the destination is != Dst using an OR and
  // negate it.
  DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;

DefaultMask = Builder.createNot(DefaultMask);
EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
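// Standalone sketch of the mask construction above for a hypothetical switch
// with cases 1 and 3 going to block B and everything else to the default
// block D: mask(Src->B) = (x==1) | (x==3) and mask(Src->D) = !mask(Src->B).
// Per-lane booleans stand in for VPValues; names are illustrative.
#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N> struct SwitchMasks {
  std::array<bool, N> ToCaseDst; // mask for edge Src -> B
  std::array<bool, N> ToDefault; // mask for edge Src -> D
};

template <std::size_t N>
SwitchMasks<N> buildSwitchMasks(const std::array<int32_t, N> &X) {
  SwitchMasks<N> M{};
  for (std::size_t Lane = 0; Lane < N; ++Lane) {
    // OR of the compares for all cases targeting the same destination.
    M.ToCaseDst[Lane] = (X[Lane] == 1) || (X[Lane] == 3);
    // Default is taken iff no non-default case is taken.
    M.ToDefault[Lane] = !M.ToCaseDst[Lane];
  }
  return M;
}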
8127// Look for cached value. 8128 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8130if (ECEntryIt != EdgeMaskCache.
end())
8131return ECEntryIt->second;
8133if (
auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8135assert(EdgeMaskCache.
contains(Edge) &&
"Mask for Edge not created?");
8136return EdgeMaskCache[Edge];
8141// The terminator has to be a branch inst! 8142BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8143assert(BI &&
"Unexpected terminator found");
8145return EdgeMaskCache[Edge] = SrcMask;
8147// If source is an exiting block, we know the exit edge is dynamically dead 8148// in the vector loop, and thus we don't need to restrict the mask. Avoid 8149// adding uses of an otherwise potentially dead instruction unless we are 8150// vectorizing a loop with uncountable exits. In that case, we always 8151// materialize the mask. 8154return EdgeMaskCache[Edge] = SrcMask;
8157assert(EdgeMask &&
"No Edge Mask found for condition");
8162if (SrcMask) {
// Otherwise block in-mask is all-one, no need to AND. 8163// The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8164// is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8165// instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8169return EdgeMaskCache[Edge] = EdgeMask;
8175// Look for cached value. 8176 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8178assert(ECEntryIt != EdgeMaskCache.
end() &&
8179"looking up mask for edge which has not been created");
8180return ECEntryIt->second;
8186// When not folding the tail, use nullptr to model all-true mask. 8188 BlockMaskCache[Header] =
nullptr;
8192// Introduce the early-exit compare IV <= BTC to form header block mask. 8193// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8194// constructing the desired canonical IV in the header block as its first 8195// non-phi instructions. 8200 HeaderVPBB->
insert(
IV, NewInsertionPoint);
8207 BlockMaskCache[Header] = BlockMask;
8211// Return the cached value. 8213assert(BCEntryIt != BlockMaskCache.
end() &&
8214"Trying to access mask for block without one.");
8215return BCEntryIt->second;
8220assert(BlockMaskCache.
count(BB) == 0 &&
"Mask for block already computed");
8222"Loop header must have cached block mask");
8224// All-one mask is modelled as no-mask following the convention for masked 8225// load/store/gather/scatter. Initialize BlockMask to no-mask. 8227// This is the block mask. We OR all unique incoming edges. 8228for (
auto *Predecessor :
  if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
    BlockMaskCache[BB] = EdgeMask;

  if (!BlockMask) { // BlockMask has its initialized nullptr value.
    BlockMask = EdgeMask;

  BlockMask = Builder.createOr(BlockMask, EdgeMask, {});

BlockMaskCache[BB] = BlockMask;
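// Minimal sketch of the OR-of-incoming-edges logic above, using
// std::optional to model the "no mask means all-true" convention (nullptr in
// the real code). Names are illustrative, not the vectorizer's; masks are
// assumed to have equal lane counts.
#include <cstddef>
#include <optional>
#include <vector>

using LaneMask = std::vector<bool>;

static std::optional<LaneMask>
blockMaskFromEdges(const std::vector<std::optional<LaneMask>> &EdgeMasks) {
  std::optional<LaneMask> BlockMask; // starts out meaning "all-true"
  for (const auto &EdgeMask : EdgeMasks) {
    if (!EdgeMask)
      return std::nullopt; // an all-true incoming edge makes the block all-true
    if (!BlockMask) {
      BlockMask = EdgeMask; // first real mask seen
      continue;
    }
    for (std::size_t I = 0; I < BlockMask->size(); ++I)
      (*BlockMask)[I] = (*BlockMask)[I] || (*EdgeMask)[I];
  }
  return BlockMask;
}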
8250assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
8251"Must be called with either a load or store");
8257"CM decision should be taken at this point.");
8273// Determine if the pointer operand of the access is either consecutive or 8274// reverse consecutive. 8283auto *
GEP = dyn_cast<GetElementPtrInst>(
8284Ptr->getUnderlyingValue()->stripPointerCasts());
8287// When folding the tail, we may compute an address that we don't in the 8288// original scalar loop and it may not be inbounds. Drop Inbounds in that 8298GEP ?
GEP->getNoWrapFlags()
8305if (
LoadInst *Load = dyn_cast<LoadInst>(
I))
8314/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8315/// insert a recipe to expand the step for the induction recipe. 8323"step must be loop invariant");
8327if (
auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8330 TruncI->getDebugLoc());
8332assert(isa<PHINode>(PhiOrTrunc) &&
"must be a phi node here");
8334 IndDesc, Phi->getDebugLoc());
8340// Check if this is an integer or fp induction. If so, build the recipe that 8341// produces its scalar and vector values. 8344 *PSE.
getSE(), *OrigLoop);
8346// Check if this is pointer induction. If so, build the recipe for it. 8364// Optimize the special case where the source is a constant integer 8365// induction variable. Notice that we can only optimize the 'trunc' case 8366// because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8367// (c) other casts depend on pointer size. 8369// Determine whether \p K is a truncation based on an induction variable that 8371auto IsOptimizableIVTruncate =
8379 IsOptimizableIVTruncate(
I),
Range)) {
8381auto *
Phi = cast<PHINode>(
I->getOperand(0));
8392unsigned NumIncoming =
Phi->getNumIncomingValues();
8394// We know that all PHIs in non-header blocks are converted into selects, so 8395// we don't have to worry about the insertion order and we can just use the 8396// builder. At this point we generate the predication tree. There may be 8397// duplications since this is a simple recursive scan, but future 8398// optimizations will clean it up. 8401for (
unsigned In = 0;
In < NumIncoming;
In++) {
8406assert(In == 0 &&
"Both null and non-null edge masks found");
8408"Distinct incoming values with one having a full mask");
8429if (
ID && (
ID == Intrinsic::assume ||
ID == Intrinsic::lifetime_end ||
8430ID == Intrinsic::lifetime_start ||
ID == Intrinsic::sideeffect ||
8431ID == Intrinsic::pseudoprobe ||
8432ID == Intrinsic::experimental_noalias_scope_decl))
8437// Is it beneficial to perform intrinsic call compared to lib call? 8438bool ShouldUseVectorIntrinsic =
8445if (ShouldUseVectorIntrinsic)
8450 std::optional<unsigned> MaskPos;
// Is it better to call a vectorized version of the function than to
// scalarize the call?

// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual Call for the vectorized
// version of the instruction.

// If we've found a variant at a previous VF, then stop looking. A
// vectorized variant of a function expects input in a certain shape
// -- basically the number of input registers, the number of lanes
// per register, and whether there's a mask required.
// We store a pointer to the variant in the VPWidenCallRecipe, so
// once we have an appropriate variant it's only valid for that VF.
// This will force a different vplan to be generated for each VF that
// finds a valid variant.
8473 MaskPos = Decision.MaskPos;
8480if (ShouldUseVectorCall) {
8481if (MaskPos.has_value()) {
8482// We have 2 cases that would require a mask: 8483// 1) The block needs to be predicated, either due to a conditional 8484// in the scalar loop or use of an active lane mask with 8485// tail-folding, and we use the appropriate mask for the block. 8486// 2) No mask is required for the block, but the only available 8487// vector variant at this VF requires a mask, so we synthesize an 8496 Ops.insert(Ops.
begin() + *MaskPos, Mask);
8507assert(!isa<BranchInst>(
I) && !isa<PHINode>(
I) && !isa<LoadInst>(
I) &&
8508 !isa<StoreInst>(
I) &&
"Instruction should have been handled earlier");
8509// Instruction should be widened, unless it is scalar after vectorization, 8510// scalarization is profitable or it is predicated. 8523switch (
I->getOpcode()) {
8526case Instruction::SDiv:
8527case Instruction::UDiv:
8528case Instruction::SRem:
8529case Instruction::URem: {
8530// If not provably safe, use a select to form a safe divisor before widening the 8531// div/rem operation itself. Otherwise fall through to general handling below. 8537auto *SafeRHS = Builder.
createSelect(Mask, Ops[1], One,
I->getDebugLoc());
8543case Instruction::Add:
8544case Instruction::And:
8545case Instruction::AShr:
8546case Instruction::FAdd:
8547case Instruction::FCmp:
8548case Instruction::FDiv:
8549case Instruction::FMul:
8550case Instruction::FNeg:
8551case Instruction::FRem:
8552case Instruction::FSub:
8553case Instruction::ICmp:
8554case Instruction::LShr:
8555case Instruction::Mul:
8556case Instruction::Or:
8557case Instruction::Select:
8558case Instruction::Shl:
8559case Instruction::Sub:
8560case Instruction::Xor:
8561case Instruction::Freeze:
8564// The legacy cost model uses SCEV to check if some of the operands are 8565// constants. To match the legacy cost model's behavior, use SCEV to try 8566// to replace operands with constants. 8568auto GetConstantViaSCEV = [
this, &SE](
VPValue *
Op) {
8570if (isa<Constant>(V) || !SE.
isSCEVable(
V->getType()))
8572auto *
C = dyn_cast<SCEVConstant>(SE.
getSCEV(V));
8577// For Mul, the legacy cost model checks both operands. 8578if (
I->getOpcode() == Instruction::Mul)
8579 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8580// For other binops, the legacy cost model only checks the second operand. 8581 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8590// FIXME: Support other operations. 8591unsigned Opcode =
HI->Update->getOpcode();
8592assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8593"Histogram update operation must be an Add or Sub");
8601// In case of predicated execution (due to tail-folding, or conditional 8602// execution, or both), pass the relevant mask. 8608HI->Store->getDebugLoc());
8614auto *PN = cast<PHINode>(R->getUnderlyingValue());
8616getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8629// Even if the instruction is not marked as uniform, there are certain 8630// intrinsic calls that can be effectively treated as such, so we check for 8631// them here. Conservatively, we only do this for scalable vectors, since 8632// for fixed-width VFs we can always fall back on full scalarization. 8633if (!IsUniform &&
Range.Start.isScalable() && isa<IntrinsicInst>(
I)) {
8635case Intrinsic::assume:
8636case Intrinsic::lifetime_start:
8637case Intrinsic::lifetime_end:
// For scalable vectors if one of the operands is variant then we still
// want to mark as uniform, which will generate one instruction for just
// the first lane of the vector. We can't scalarize the call in the same
// way as for fixed-width vectors because we don't know how many lanes
// there are.
//
// The reasons for doing it this way for scalable vectors are:
//   1. For the assume intrinsic generating the instruction for the first
//      lane is still better than not generating any at all. For
//      example, the input may be a splat across all lanes.
//   2. For the lifetime start/end intrinsics the pointer operand only
//      does anything useful when the input comes from a stack object,
//      which suggests it should always be uniform. For non-stack objects
//      the effect is to poison the object, which still allows us to

// Finalize the recipe for Instr, first if it is not predicated.

// Instructions marked for predication are replicated and a mask operand is
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.

// Note that there is some custom logic to mark some intrinsics as uniform
// manually above for scalable vectors, which this assert needs to account
// for.
assert((
Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8676 (
Range.Start.isScalable() && isa<IntrinsicInst>(
I))) &&
8677"Should not predicate a uniform recipe");
8679 IsUniform, BlockInMask);
8683/// Find all possible partial reductions in the loop and track all of those that 8684/// are valid so recipes can be formed later. 8686// Find all possible partial reductions. 8688 PartialReductionChains;
8690 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(),
Range,
8691 PartialReductionChains);
8694// A partial reduction is invalid if any of its extends are used by 8695// something that isn't another partial reduction. This is because the 8696// extends are intended to be lowered along with the reduction itself. 8698// Build up a set of partial reduction bin ops for efficient use checking. 8700for (
const auto &[PartialRdx, _] : PartialReductionChains)
  PartialReductionBinOps.insert(PartialRdx.BinOp);

auto ExtendIsOnlyUsedByPartialReductions =
    return all_of(Extend->users(), [&](const User *U) {
      return PartialReductionBinOps.contains(U);

// Check if each use of a chain's two extends is a partial reduction
// and only add those that don't have non-partial-reduction users.
for (auto Pair : PartialReductionChains) {
  if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
      ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
    ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
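// Illustrative contrast for the extend-use rule above (plain C++, not from
// the source). In the first loop both extends feed only the multiply of the
// reduction, so the chain can become a partial reduction; in the second the
// extend of A[I] escapes to another user, so lowering the extend away with
// the reduction would be incorrect.
#include <cstddef>
#include <cstdint>

int32_t valid_partial_rdx(const int8_t *A, const int8_t *B, std::size_t N) {
  int32_t Sum = 0;
  for (std::size_t I = 0; I < N; ++I)
    Sum += int32_t(A[I]) * int32_t(B[I]); // extends only used by the mul
  return Sum;
}

int32_t invalid_partial_rdx(const int8_t *A, const int8_t *B, int32_t *Copy,
                            std::size_t N) {
  int32_t Sum = 0;
  for (std::size_t I = 0; I < N; ++I) {
    int32_t WideA = int32_t(A[I]); // extend also stored below -> extra user
    Copy[I] = WideA;
    Sum += WideA * int32_t(B[I]);
  }
  return Sum;
}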
8720bool VPRecipeBuilder::getScaledReductions(
8727// TODO: Allow scaling reductions when predicating. The select at 8728// the end of the loop chooses between the phi value and most recent 8729// reduction result, both of which have different VFs to the active lane 8730// mask when scaling. 8734auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8738Value *
Op = Update->getOperand(0);
8739Value *PhiOp = Update->getOperand(1);
8743// Try and get a scaled reduction from the first non-phi operand. 8744// If one is found, we use the discovered reduction instruction in 8745// place of the accumulator for costing. 8746if (
auto *OpInst = dyn_cast<Instruction>(
Op)) {
8747if (getScaledReductions(
PHI, OpInst,
Range, Chains)) {
8748PHI = Chains.rbegin()->first.Reduction;
8750Op = Update->getOperand(0);
8751 PhiOp = Update->getOperand(1);
8759auto *BinOp = dyn_cast<BinaryOperator>(
Op);
8760if (!BinOp || !BinOp->hasOneUse())
8769Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8770Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
unsigned TargetScaleFactor =
    PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
        A->getType()->getPrimitiveSizeInBits());
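// Worked example of the scale factor computed above (illustrative): with an
// i32 accumulator phi and i8 inputs, the accumulator is 32 / 8 = 4 times as
// wide as one input lane, so the partial reduction is scaled by a factor of 4
// (e.g. a <16 x i8> input chain accumulating into <4 x i32>).
static constexpr unsigned scaleFactor(unsigned PhiBits, unsigned InputBits) {
  return PhiBits / InputBits; // 32 / 8 == 4
}
static_assert(scaleFactor(32, 8) == 4, "i8 -> i32 accumulates 4:1");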
8786 Update->getOpcode(),
A->getType(),
B->getType(),
PHI->getType(),
8787 VF, OpAExtend, OpBExtend,
8788 std::make_optional(BinOp->getOpcode()));
8792 Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8803// First, check for specific widening recipes that deal with inductions, Phi 8804// nodes, calls and memory operations. 8806if (
auto *Phi = dyn_cast<PHINode>(Instr)) {
8807if (Phi->getParent() != OrigLoop->
getHeader())
8810if ((Recipe = tryToOptimizeInductionPHI(Phi,
Operands,
Range)))
8816"can only widen reductions and fixed-order recurrences here");
8824// If the PHI is used by a partial reduction, set the scale factor. 8825unsigned ScaleFactor =
8831// TODO: Currently fixed-order recurrences are modeled as chains of 8832// first-order recurrences. If there are no users of the intermediate 8833// recurrences in the chain, the fixed order recurrence should be modeled 8834// directly, enabling more efficient codegen. 8838 PhisToFix.push_back(PhiRecipe);
8842if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8846// All widen recipes below deal only with VF > 1. 8851if (
auto *CI = dyn_cast<CallInst>(Instr))
8854if (
StoreInst *SI = dyn_cast<StoreInst>(Instr))
8856return tryToWidenHistogram(*HistInfo,
Operands);
8858if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8864if (!shouldWiden(Instr,
Range))
8867if (
auto *
GEP = dyn_cast<GetElementPtrInst>(Instr))
8871if (
auto *SI = dyn_cast<SelectInst>(Instr)) {
8876if (
auto *CI = dyn_cast<CastInst>(Instr)) {
8881return tryToWiden(Instr,
Operands, VPBB);
8888"Unexpected number of operands for partial reduction");
8893if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8894 isa<VPPartialReductionRecipe>(BinOpRecipe))
8901void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
ElementCount MinVF,
8905auto MaxVFTimes2 = MaxVF * 2;
8907VFRange SubRange = {VF, MaxVFTimes2};
8908if (
auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8909// Now optimize the initial VPlan. 8914// TODO: try to put it close to addActiveLaneMask(). 8915// Discard the plan if it is not EVL-compatible 8920 VPlans.push_back(std::move(Plan));
8926// Add the necessary canonical IV and branch recipes required to control the 8930Value *StartIdx = ConstantInt::get(IdxTy, 0);
8933// Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8937 Header->insert(CanonicalIVPHI, Header->begin());
8940// Add a VPInstruction to increment the scalar canonical IV by VF * UF. 8942 Instruction::Add, {CanonicalIVPHI, &Plan.
getVFxUF()}, {HasNUW,
false},
DL,
8944 CanonicalIVPHI->
addOperand(CanonicalIVIncrement);
8946// Add the BranchOnCount VPInstruction to the latch. 8951/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 8952/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 8953/// the end value of the induction. 8957auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8958// Truncated wide inductions resume from the last lane of their vector value 8959// in the last vector iteration which is handled elsewhere. 8960if (WideIntOrFp && WideIntOrFp->getTruncInst())
8967if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8969ID.getKind(), dyn_cast_or_null<FPMathOperator>(
ID.getInductionBinOp()),
8970 Start, VectorTC, Step);
8973// EndValue is derived from the vector trip count (which has the same type as 8974// the widest induction) and thus may be wider than the induction here. 8982auto *ResumePhiRecipe =
8985return ResumePhiRecipe;
8988/// Create resume phis in the scalar preheader for first-order recurrences, 8989/// reductions and inductions, and update the VPIRInstructions wrapping the 8990/// original phis in the scalar header. End values for inductions are added to 8996auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9000VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9005auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9006auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9010// TODO: Extract final value from induction recipe initially, optimize to 9011// pre-computed end value together in optimizeInductionExitUsers. 9012auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.
getRecipe(ScalarPhiI));
9013if (
auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9015 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9018"Expected a ResumePhi");
9019 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9020 ScalarPhiIRI->addOperand(ResumePhi);
9023// TODO: Also handle truncated inductions here. Computing end-values 9024// separately should be done as VPlan-to-VPlan optimization, after 9025// legalizing all resume values to use the last lane from the loop. 9026assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9027"should only skip truncated wide inductions");
9031// The backedge value provides the value to resume coming out of a loop, 9032// which for FORs is a vector whose last element needs to be extracted. The 9033// start value provides the value if the loop is bypassed. 9034bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9035auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9037"Cannot handle loops with uncountable early exits");
9041"vector.recur.extract");
9045 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {},
Name);
9050// Collect VPIRInstructions for phis in the exit blocks that are modeled 9051// in VPlan and add the exiting VPValue as operand. 9058auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9061auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9065assert(ExitIRI->getNumOperands() ==
9066 ExitVPBB->getPredecessors().size() &&
9067"early-exit must update exit values on construction");
9071Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9073 ExitIRI->addOperand(V);
9076assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9077"Only recipes defined inside a region should need fixing.");
9078 ExitUsersToFix.
insert(ExitIRI);
9081return ExitUsersToFix;
9084// Add exit values to \p Plan. Extracts are added for each entry in \p 9085// ExitUsersToFix if needed and their operands are updated. 9089if (ExitUsersToFix.
empty())
9093VPBuilderB(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9095// Introduce extract for exiting values and update the VPIRInstructions 9096// modeling the corresponding LCSSA phis. 9098assert(ExitIRI->getNumOperands() == 1 &&
9099 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9100"exit values from early exits must be fixed when branch to " 9101"early-exit is added");
9102 ExitIRI->extractLastLaneOfOperand(
B);
9106/// Handle users in the exit block for first order reductions in the original 9107/// exit block. The penultimate value of recurrences is fed to their LCSSA phi 9108/// users in the original exit block using the VPIRInstruction wrapping to the 9116VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9121auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9126"Cannot handle loops with uncountable early exits");
9128// This is the second phase of vectorizing first-order recurrences, creating 9129// extract for users outside the loop. An overview of the transformation is 9130// described below. Suppose we have the following loop with some use after 9131// the loop of the last a[i-1], 9133// for (int i = 0; i < n; ++i) { 9139// There is a first-order recurrence on "a". For this loop, the shorthand 9140// scalar IR looks like: 9147// i = phi [0, scalar.ph], [i+1, scalar.body] 9148// s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9151// br cond, scalar.body, exit.block 9154// use = lcssa.phi [s1, scalar.body] 9156// In this example, s1 is a recurrence because it's value depends on the 9157// previous iteration. In the first phase of vectorization, we created a 9158// VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9159// for users in the scalar preheader and exit block. 9162// v_init = vector(..., ..., ..., a[-1]) 9166// i = phi [0, vector.ph], [i+4, vector.body] 9167// v1 = phi [v_init, vector.ph], [v2, vector.body] 9168// v2 = a[i, i+1, i+2, i+3] 9170// // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9171// b[i, i+1, i+2, i+3] = v2 - v1 9172// br cond, vector.body, middle.block 9175// vector.recur.extract.for.phi = v2(2) 9176// vector.recur.extract = v2(3) 9177// br cond, scalar.ph, exit.block 9180// scalar.recur.init = phi [vector.recur.extract, middle.block], 9181// [s.init, otherwise] 9185// i = phi [0, scalar.ph], [i+1, scalar.body] 9186// s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9189// br cond, scalar.body, exit.block 9192// lo = lcssa.phi [s1, scalar.body], 9193// [vector.recur.extract.for.phi, middle.block] 9195// Now update VPIRInstructions modeling LCSSA phis in the exit block. 9196// Extract the penultimate value of the recurrence and use it as operand for 9197// the VPIRInstruction modeling the phi. 9199if (ExitIRI->getOperand(0) != FOR)
9203"vector.recur.extract.for.phi");
9205 ExitUsersToFix.remove(ExitIRI);
9211LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VFRange &
Range) {
9215// --------------------------------------------------------------------------- 9216// Build initial VPlan: Scan the body of the loop in a topological order to 9217// visit each basic block after having visited its predecessor basic blocks. 9218// --------------------------------------------------------------------------- 9220// Create initial VPlan skeleton, having a basic block for the pre-header 9221// which contains SCEV expansions that need to happen before the CFG is 9222// modified; a basic block for the vector pre-header, followed by a region for 9223// the vector loop, followed by the middle basic block. The skeleton vector 9224// loop region contains a header and latch basic blocks. 9226bool RequiresScalarEpilogueCheck =
9233 PSE, RequiresScalarEpilogueCheck,
9236// Don't use getDecisionAndClampRange here, because we don't know the UF 9237// so this function is better to be conservative, rather than to split 9238// it up into different VPlans. 9239// TODO: Consider using getDecisionAndClampRange here to split up VPlans. 9240bool IVUpdateMayOverflow =
false;
9246// Use NUW for the induction increment if we proved that it won't overflow in 9247// the vector loop or when not folding the tail. In the later case, we know 9248// that the canonical induction increment will not overflow as the vector trip 9249// count is >= increment and a multiple of the increment. 9256// --------------------------------------------------------------------------- 9257// Pre-construction: record ingredients whose recipes we'll need to further 9258// process after constructing the initial VPlan. 9259// --------------------------------------------------------------------------- 9261// For each interleave group which is relevant for this (possibly trimmed) 9262// Range, add it to the set of groups to be later applied to the VPlan and add 9263// placeholders for its members' Recipes which we'll be replacing with a 9264// single VPInterleaveRecipe. 9270// For scalable vectors, the only interleave factor currently supported 9271// is 2 since we require the (de)interleave2 intrinsics instead of 9274"Unsupported interleave factor for scalable vectors");
9279 InterleaveGroups.
insert(IG);
9282// --------------------------------------------------------------------------- 9283// Construct recipes for the instructions in the loop 9284// --------------------------------------------------------------------------- 9286// Scan the body of the loop in a topological order to visit each basic block 9287// after having visited its predecessor basic blocks. 9297 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9298 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9301 RecipeBuilder.collectScaledReductions(
Range);
9303auto *MiddleVPBB = Plan->getMiddleBlock();
9306// Relevant instructions from basic block BB will be grouped into VPRecipe 9307// ingredients and fill a new VPBasicBlock. 9308if (VPBB != HeaderVPBB)
9312if (VPBB == HeaderVPBB)
9313 RecipeBuilder.createHeaderMask();
9315 RecipeBuilder.createBlockInMask(BB);
9317// Introduce each ingredient into VPlan. 9318// TODO: Model and preserve debug intrinsics in VPlan. 9322auto *
Phi = dyn_cast<PHINode>(Instr);
9323if (Phi &&
Phi->getParent() == HeaderBB) {
9324Operands.push_back(Plan->getOrAddLiveIn(
9327auto OpRange = RecipeBuilder.mapToVPValues(
Instr->operands());
9328Operands = {OpRange.begin(), OpRange.end()};
9331// The stores with invariant address inside the loop will be deleted, and 9332// in the exit block, a uniform store recipe will be created for the final 9333// invariant store of the reduction. 9335if ((SI = dyn_cast<StoreInst>(&
I)) &&
9337// Only create recipe for the final invariant store of the reduction. 9341 SI, RecipeBuilder.mapToVPValues(
Instr->operands()),
9342true/* IsUniform */);
9343 Recipe->insertBefore(*MiddleVPBB, MBIP);
9348 RecipeBuilder.tryToCreateWidenRecipe(Instr,
Operands,
Range, VPBB);
9350 Recipe = RecipeBuilder.handleReplication(Instr,
Range);
9352 RecipeBuilder.setRecipe(Instr, Recipe);
9353if (isa<VPHeaderPHIRecipe>(Recipe)) {
9354// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9355// the following cases, VPHeaderPHIRecipes may be created after non-phi 9356// recipes and need to be moved to the phi section of HeaderVPBB: 9357// * tail-folding (non-phi recipes computing the header mask are 9358// introduced earlier than regular header phi recipes, and should appear 9360// * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 9364"unexpected recipe needs moving");
9374// After here, VPBB should not be used. 9377assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9378 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9379"entry block must be set to a VPRegionBlock having a non-empty entry " 9381 RecipeBuilder.fixHeaderPhis();
9383// Update wide induction increments to use the same step as the corresponding 9384// wide induction. This enables detecting induction increments directly in 9385// VPlan and removes redundant splats. 9387auto *IVInc = cast<Instruction>(
9392 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9397if (
auto *UncountableExitingBlock =
9400 *Plan, *PSE.
getSE(), OrigLoop, UncountableExitingBlock,
9403"Some exit values in loop with uncountable exit not supported yet",
9404"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9415// --------------------------------------------------------------------------- 9416// Transform initial VPlan: Apply previously taken decisions, in order, to 9417// bring the VPlan to its final state. 9418// --------------------------------------------------------------------------- 9420// Adjust the recipes for any inloop reductions. 9421 adjustRecipesForReductions(Plan, RecipeBuilder,
Range.Start);
9423// Interleave memory: for each Interleave Group we marked earlier as relevant 9424// for this VPlan, replace the Recipes widening its memory instructions with a 9425// single VPInterleaveRecipe at its insertion point. 9431 Plan->setName(
"Initial VPlan");
9433// Replace VPValues for known constant strides guaranteed by predicate scalar 9436auto *
R = cast<VPRecipeBase>(&U);
9437returnR->getParent()->getParent() ||
9439 Plan->getVectorLoopRegion()->getSinglePredecessor();
9442auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9443auto *ScevStride = dyn_cast<SCEVConstant>(PSE.
getSCEV(StrideV));
9444// Only handle constant strides for now. 9448auto *CI = Plan->getOrAddLiveIn(
9449 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9450if (
VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9453// The versioned value may not be used in the loop directly but through a 9454// sext/zext. Add new live-ins in those cases. 9456if (!isa<SExtInst, ZExtInst>(U))
9458VPValue *StrideVPV = Plan->getLiveIn(U);
9461unsigned BW =
U->getType()->getScalarSizeInBits();
9462APIntC = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9463 : ScevStride->getAPInt().zext(BW);
9464VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
U->getType(),
C));
9473// Sink users of fixed-order recurrence past the recipe defining the previous 9474// value and introduce FirstOrderRecurrenceSplice VPInstructions. 9479// TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9480// TailFoldingStyle is visible there. 9482bool WithoutRuntimeCheck =
9485 WithoutRuntimeCheck);
9494// Outer loop handling: They may require CFG and instruction level 9495// transformations before even evaluating whether vectorization is profitable. 9496// Since we cannot modify the incoming IR, we need to build VPlan upfront in 9497// the vectorization pipeline. 9501// Create new empty VPlan 9503true,
false, OrigLoop);
9505// Build hierarchical CFG 9507 HCFGBuilder.buildHierarchicalCFG();
9515 *PSE.
getSE(), *TLI);
9517// Tail folding is not supported for outer loops, so the induction increment 9518// is guaranteed to not wrap. 9523// Collect mapping of IR header phis to header phi recipes, to be used in 9524// addScalarResumePhis. 9527for (
auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9528if (isa<VPCanonicalIVPHIRecipe>(&R))
9530auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9531 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9534// TODO: IVEndValues are not used yet in the native path, to optimize exit 9542// Adjust the recipes for reductions. For in-loop reductions the chain of 9543// instructions leading from the loop exit instr to the phi need to be converted 9544// to reductions, with one operand being vector and the other being the scalar 9545// reduction chain. For other reductions, a select is introduced between the phi 9546// and users outside the vector region when folding the tail. 9548// A ComputeReductionResult recipe is added to the middle block, also for 9549// in-loop reductions which compute their result in-loop, because generating 9550// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9552// Adjust AnyOf reductions; replace the reduction phi for the selected value 9553// with a boolean reduction phi node to check if the condition is true in any 9554// iteration. The final value is selected by the final ComputeReductionResult. 9555void LoopVectorizationPlanner::adjustRecipesForReductions(
9557using namespaceVPlanPatternMatch;
9558VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9564auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9565if (!PhiR || !PhiR->isInLoop() || (MinVF.
isScalar() && !PhiR->isOrdered()))
9573"AnyOf and FindLast reductions are not allowed for in-loop reductions");
9575// Collect the chain of "link" recipes for the reduction starting at PhiR. 9578for (
unsignedI = 0;
I != Worklist.
size(); ++
I) {
9581auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9582if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9583assert((UserRecipe->getParent() == MiddleVPBB ||
9584 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9585"U must be either in the loop region, the middle block or the " 9586"scalar preheader.");
9589 Worklist.
insert(UserRecipe);
9593// Visit operation "Links" along the reduction chain top-down starting from 9594// the phi until LoopExitValue. We keep track of the previous item 9595// (PreviousLink) to tell which of the two operands of a Link will remain 9596// scalar and which will be reduced. For minmax by select(cmp), Link will be 9597// the select instructions. Blend recipes of in-loop reduction phi's will 9598// get folded to their non-phi operand, as the reduction recipe handles the 9599// condition directly. 9602Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9604// Index of the first operand which holds a non-mask vector operand. 9605unsigned IndexOfFirstOperand;
9606// Recognize a call to the llvm.fmuladd intrinsic. 9613"Expected instruction to be a call to the llvm.fmuladd intrinsic");
9614assert(((MinVF.
isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9615 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9616 CurrentLink->getOperand(2) == PreviousLink &&
9617"expected a call where the previous link is the added operand");
9619// If the instruction is a call to the llvm.fmuladd intrinsic then we 9620// need to create an fmul recipe (multiplying the first two operands of 9621// the fmuladd together) to use as the vector operand for the fadd 9625 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9627 LinkVPBB->
insert(FMulRecipe, CurrentLink->getIterator());
9630auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9631if (PhiR->isInLoop() && Blend) {
9632assert(Blend->getNumIncomingValues() == 2 &&
9633"Blend must have 2 incoming values");
9634if (Blend->getIncomingValue(0) == PhiR)
9635 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9637assert(Blend->getIncomingValue(1) == PhiR &&
9638"PhiR must be an operand of the blend");
9639 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9645if (isa<VPWidenRecipe>(CurrentLink)) {
9646assert(isa<CmpInst>(CurrentLinkI) &&
9647"need to have the compare of the select");
9650assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9651"must be a select recipe");
9652 IndexOfFirstOperand = 1;
9655"Expected to replace a VPWidenSC");
9656 IndexOfFirstOperand = 0;
9658// Note that for non-commutable operands (cmp-selects), the semantics of 9659// the cmp-select are captured in the recurrence kind. 9661 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9662 ? IndexOfFirstOperand + 1
9663 : IndexOfFirstOperand;
9664 VecOp = CurrentLink->getOperand(VecOpId);
9665assert(VecOp != PreviousLink &&
9666 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9667 (VecOpId - IndexOfFirstOperand)) ==
9669"PreviousLink must be the operand other than VecOp");
9678 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9680// Append the recipe to the end of the VPBasicBlock because we need to 9681// ensure that it comes after all of it's inputs, including CondOp. 9682// Delete CurrentLink as it will be invalid if its operand is replaced 9683// with a reduction defined at the bottom of the block in the next link. 9685 CurrentLink->replaceAllUsesWith(RedRecipe);
9687 PreviousLink = RedRecipe;
9694 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9700// If tail is folded by masking, introduce selects between the phi 9701// and the users outside the vector region of each reduction, at the 9702// beginning of the dedicated latch block. 9707assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9708"reduction recipe must be defined before latch");
9710 std::optional<FastMathFlags> FMFs =
9717return isa<VPInstruction>(&U) &&
9718 cast<VPInstruction>(&U)->getOpcode() ==
9726// If the vector reduction can be performed in a smaller type, we truncate 9727// then extend the loop exit value to enable InstCombine to evaluate the 9728// entire expression in the smaller type. 9733assert(!PhiR->
isInLoop() &&
"Unexpected truncated inloop reduction!");
9742 Trunc->
insertAfter(NewExitingVPV->getDefiningRecipe());
9743 Extnd->insertAfter(Trunc);
9745 PhiR->
setOperand(1, Extnd->getVPSingleValue());
9746 NewExitingVPV = Extnd;
9749// We want code in the middle block to appear to execute on the location of 9750// the scalar loop's latch terminator because: (a) it is all compiler 9751// generated, (b) these instructions are always executed after evaluating 9752// the latch conditional branch, and (c) other passes may add new 9753// predecessors which terminate on this line. This is the easiest way to 9754// ensure we don't accidentally cause an extra step back into the loop while 9758// TODO: At the moment ComputeReductionResult also drives creation of the 9759// bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9760// even for in-loop reductions, until the reduction resume value handling is 9761// also modeled in VPlan. 9764// Update all users outside the vector region. 9766 FinalReductionResult, [](
VPUser &
User,
unsigned) {
9767auto *Parent = cast<VPRecipeBase>(&
User)->getParent();
9768return Parent && !Parent->getParent();
9770 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9772// Adjust AnyOf reductions; replace the reduction phi for the selected value 9773// with a boolean reduction phi node to check if the condition is true in 9774// any iteration. The final value is selected by the final 9775// ComputeReductionResult. 9779 return isa<VPWidenSelectRecipe>(U) ||
9780 (isa<VPReplicateRecipe>(U) &&
9781 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9782 Instruction::Select);
9785// If the compare is checking the reduction PHI node, adjust it to check 9788for (
unsignedI = 0;
I != CmpR->getNumOperands(); ++
I)
9789if (CmpR->getOperand(
I) == PhiR)
9795// If the true value of the select is the reduction phi, the new value is 9796// selected if the negated condition is true in any iteration. 9797if (
Select->getOperand(1) == PhiR)
9800Select->getVPSingleValue()->replaceAllUsesWith(
Or);
9801// Delete Select now that it has invalid types. 9804// Convert the reduction phi to operate on bools. 9812// Adjust the start value for FindLastIV recurrences to use the sentinel 9813// value after generating the ResumePhi recipe, which uses the original 9821R->eraseFromParent();
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
// Fast-math-flags propagate from the original induction instruction.
    cast_if_present<BinaryOperator>(FPBinOp));
// If index is the vector trip count, the concrete value will only be set in
// prepareToExecute, leading to missed simplifications, e.g. if it is 0.
// TODO: Remove the special case for the vector trip count once it is computed
// in VPlan and can be used during VPlan simplification.
assert((DerivedIV != Index ||
       "IV didn't need transforming?");

if (State.Lane) {
  // Generate a single instance.
  "uniform recipe shouldn't be predicated");
  // Insert scalar instance packing it into a vector.
  // If we're constructing lane 0, initialize to start from poison.
  if (State.Lane->isFirstLane()) {

// Uniform within VL means we need to generate lane 0.
// A store of a loop varying value to a uniform address only needs the last
// copy of the store.
if (isa<StoreInst>(UI) &&

// Generate scalar instances for all VF lanes.
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
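// Illustrative sketch (not part of this file): the per-lane scalarization
// loop above, with a callback standing in for cloning the instruction for one
// lane. EndLane is assumed to be at least 1; the helper name is hypothetical.
#include <functional>

static void scalarizeAcrossLanes(unsigned EndLane, bool StoreToUniformAddress,
                                 const std::function<void(unsigned)> &EmitLane) {
  if (StoreToUniformAddress) {
    // A store of a loop-varying value to a uniform address only needs the
    // last copy; earlier copies would be overwritten anyway.
    EmitLane(EndLane - 1);
    return;
  }
  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
    EmitLane(Lane);   // one scalar clone per lane
}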
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.

// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
// LoopAccessInfo (due to code dependency and not being able to reliably get
// PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
// of strides in LoopAccessInfo::analyzeLoop() and vectorize without
// versioning when the vectorization is forced, unlike hasOptSize. So revert
// back to the old way and vectorize with versioning when forced. See D81345.)

// 2) If set, obey the directives.

// 3) If set, obey the hints.

// 4) If the TTI hook indicates this is profitable, request predication.

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying
// the input IR.
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");

Function *F = L->getHeader()->getParent();
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,

// Get user vectorization factor.
// Plan how to best vectorize, return the best VF and its cost.

// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.
bool AddBranchWeights =
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
    VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
    << L->getHeader()->getParent()->getName() << "\"\n");
// Mark the loop as already vectorized to avoid vectorizing again.

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
if (auto *S = dyn_cast<StoreInst>(&Inst)) {
  if (S->getValueOperand()->getType()->isFloatTy())

// Traverse the floating point stores upwards, searching for floating point
// conversions.
while (!Worklist.empty()) {
  if (!L->contains(I))
  if (!Visited.insert(I).second)

  // Emit a remark if the floating point store required a floating
  // point conversion.
  // TODO: More work could be done to identify the root cause such as a
  // constant or a function return type and point the user to it.
  if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
        I->getDebugLoc(), L->getHeader())
        << "floating point conversion changes vector width. "
        << "Mixed floating point precision requires an up/down "
        << "cast that will negatively impact performance.";

  for (Use &Op : I->operands())
    if (auto *OpI = dyn_cast<Instruction>(Op))
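// Illustrative sketch (not part of this file): the remark logic above is a
// def-use worklist walk that starts at float stores and flags any fpext it
// reaches inside the loop. Standalone version with a hypothetical Node type
// standing in for Instruction:
#include <set>
#include <vector>

struct Node {
  bool IsFPExt = false;              // models isa<FPExtInst>(I)
  bool InLoop = true;                // models L->contains(I)
  std::vector<Node *> Operands;      // models I->operands()
};

static std::vector<Node *>
findConversionsFeedingFloatStores(const std::vector<Node *> &FloatStores) {
  std::vector<Node *> Worklist(FloatStores.begin(), FloatStores.end());
  std::set<const Node *> Visited;
  std::vector<Node *> ToRemark;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (!N->InLoop || !Visited.insert(N).second)
      continue;                      // out of the loop, or already seen
    if (N->IsFPExt)
      ToRemark.push_back(N);         // conversion that changes vector width
    for (Node *Op : N->Operands)     // keep walking up the operands
      Worklist.push_back(Op);
  }
  return ToRemark;
}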
// When interleaving only, scalar and vector cost will be equal, which in turn
// would lead to a divide by 0. Fall back to the hard threshold.
    << "LV: Interleaving only is not profitable due to runtime checks\n");

// The scalar cost should only be 0 when vectorizing with a user-specified
// VF/IC. In those cases, runtime checks should always be generated.

// First, compute the minimum iteration count required so that the vector
// loop outperforms the scalar loop.
// The total cost of the scalar loop is
//   ScalarC * TC
// where
// * TC is the actual trip count of the loop.
// * ScalarC is the cost of a single scalar iteration.
//
// The total cost of the vector loop is
//   RtC + VecC * (TC / VF) + EpiC
// where
// * RtC is the cost of the generated runtime checks
// * VecC is the cost of a single vector iteration.
// * TC is the actual trip count of the loop
// * VF is the vectorization factor
// * EpiC is the cost of the generated epilogue, including the cost
//   of the remaining scalar operations.
//
// Vectorization is profitable once the total vector cost is less than the
// total scalar cost:
//   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
//
// Now we can compute the minimum required trip count TC as
//   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
//
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers, and the result
// is rounded up, hence we get an upper estimate of the TC.

// Second, compute a minimum iteration count so that the cost of the
// runtime checks is only a fraction of the total scalar loop cost. This
// adds a loop-dependent bound on the overhead incurred if the runtime
// checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
// * TC. To bound the runtime check to be a fraction 1/X of the scalar
// cost, we need
//   RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC

// Now pick the larger minimum. If it is not a multiple of VF and a scalar
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::max(MinTC1, MinTC2);
MinTC = alignTo(MinTC, IntVF);
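// Illustrative sketch (not part of this file): the two minimum-trip-count
// bounds derived in the comments above, computed on doubles and rounded up as
// described. RtC, VecC, ScalarC, VF and X are the symbolic quantities from
// those comments; EpiC is taken as 0, as stated there. It assumes
// ScalarC * VF > VecC, i.e. the vector body is actually cheaper per element.
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint64_t minProfitableTripCount(double RtC, double VecC, double ScalarC,
                                       uint64_t VF, double X) {
  // Bound 1: total vector cost beats total scalar cost,
  //   RtC + VecC * (TC / VF) < ScalarC * TC  ==>
  //   TC > VF * RtC / (ScalarC * VF - VecC).
  uint64_t MinTC1 = static_cast<uint64_t>(
      std::ceil(double(VF) * RtC / (ScalarC * double(VF) - VecC)));
  // Bound 2: a failing runtime check costs at most a 1/X fraction of the
  // scalar loop:  RtC * X / ScalarC < TC.
  uint64_t MinTC2 = static_cast<uint64_t>(std::ceil(RtC * X / ScalarC));
  // Take the larger bound and round it up to a multiple of VF, as alignTo()
  // does above.
  uint64_t MinTC = std::max(MinTC1, MinTC2);
  return (MinTC + VF - 1) / VF * VF;
}
// For example, RtC = 20, VecC = 6, ScalarC = 2, VF = 4 and X = 8 give
// MinTC1 = 40 and MinTC2 = 80, so the trip count must be at least 80 for the
// runtime checks to pay off.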
    dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"

// Skip vectorization if the expected trip count is less than the minimum
// required trip count.
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                     "trip count < minimum profitable VF ("

    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||

/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
/// don't have a corresponding wide induction in \p EpiPlan.
// Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
// will need their resume-values computed in the main vector loop. Others
// can be removed from the main VPlan.
if (isa<VPCanonicalIVPHIRecipe>(&R))
    cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));

auto *VPIRInst = cast<VPIRInstruction>(&R);
auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
// There is no corresponding wide induction in the epilogue plan that would
// need a resume value. Remove the VPIRInst wrapping the scalar header phi
// together with the corresponding ResumePhi. The resume values for the
// scalar loop will be created during execution of EpiPlan.
using namespace VPlanPatternMatch;
// If there is a suitable resume value for the canonical induction in the
// scalar (which will become vector) epilogue loop we are done. Otherwise
// create it below.
return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
    "vec.epilog.resume.val");
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
                                          const SCEV2ValueTy &ExpandedSCEVs,
  Header->setName("vec.epilog.vector.body");

  // Re-use the trip count and steps expanded for the main loop, as
  // skeleton creation needs it as a value that dominates both the scalar
  // and vector epilogue loops.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
      Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
  ExpandR->eraseFromParent();

  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop.
  if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
    // When vectorizing the epilogue loop, the canonical induction start
    // value needs to be changed from zero to the value after the main
    // vector loop. Find the resume value created during execution of the main
    // VPlan.
    // FIXME: Improve modeling for canonical IV start values in the epilogue
    // loop.
    BasicBlock *MainMiddle = find_singleton<BasicBlock>(
        if (BB != EPI.MainLoopIterationCountCheck &&
            BB != EPI.EpilogueIterationCountCheck &&
            BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
    Type *IdxTy = IV->getScalarType();
    PHINode *EPResumeVal = find_singleton<PHINode>(
        L->getLoopPreheader()->phis(),
        if (P.getType() == IdxTy &&
            P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
            P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
    assert(EPResumeVal && "must have a resume value for the canonical IV");
           return isa<VPScalarIVStepsRecipe>(U) ||
                  isa<VPScalarCastRecipe>(U) ||
                  isa<VPDerivedIVRecipe>(U) ||
                  cast<VPInstruction>(U)->getOpcode() ==
           "the canonical IV should only be used by its increment or "
           "ScalarIVSteps when resetting the start value");
    IV->setOperand(0, VPV);

  Value *ResumeV = nullptr;
  // TODO: Move setting of resume values to prepareToExecute.
  if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
    ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
                  ->getIncomingValueForBlock(L->getLoopPreheader());
        ReductionPhi->getRecurrenceDescriptor();
    // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
    // start value; compare the final value from the main vector loop
    // to the start value.
    // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
    // to the resume value. The resume value is adjusted to the sentinel
    // value when the final value from the main vector loop equals the start
    // value. This ensures correctness when the start value might not be
    // less than the minimum value of a monotonically increasing induction
    // variable.
  // Retrieve the induction resume values for wide inductions from
  // their original phi nodes in the scalar loop.
  PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
  // Hook up to the PHINode generated by a ResumePhi recipe of main
  // loop VPlan, which feeds the scalar loop.
  assert(ResumeV && "Must have a resume value");
  cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10327"VPlan-native path is not enabled. Only process inner loops.");
10330 << L->getHeader()->getParent()->getName() <<
"' from " 10331 << L->getLocStr() <<
"\n");
10336dbgs() <<
"LV: Loop hints:" 10346// Function containing loop 10347Function *
F = L->getHeader()->getParent();
10349// Looking at the diagnostic output is the only way to determine if a loop 10350// was vectorized (other than looking at the IR or machine code), so it 10351// is important to generate an optimization remark for each loop. Most of 10352// these messages are generated as OptimizationRemarkAnalysis. Remarks 10353// generated as OptimizationRemark and OptimizationRemarkMissed are 10354// less verbose reporting vectorized loops and unvectorized loops that may 10355// benefit from vectorization, respectively. 10364// Check if it is legal to vectorize the loop. 10369LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Cannot prove legality.\n");
10376"early exit is not enabled",
10377"UncountableEarlyExitLoopsDisabled",
ORE, L);
10383"types is not yet supported",
10384"StructCallVectorizationUnsupported",
ORE, L);
10388// Entrance to the VPlan-native vectorization path. Outer loops are processed 10389// here. They may require CFG and instruction level transformations before 10390// even evaluating whether vectorization is profitable. Since we cannot modify 10391// the incoming IR, we need to build VPlan upfront in the vectorization 10393if (!L->isInnermost())
10397assert(L->isInnermost() &&
"Inner loop expected.");
// If an override option has been passed in for interleaved accesses, use it.

// Analyze interleaved memory accesses.
    [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
    "requiring a scalar epilogue is unsupported",
    "UncountableEarlyExitUnsupported", ORE, L);
// Check the function attributes and profiles to find out if this function
// should be optimized for size.

// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                  << "This loop is worth vectorizing only if no scalar "
                  << "iteration overheads are incurred.");
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");

// Predicate tail-folded loops are efficient even when the loop
// iteration count is low. However, setting the epilogue policy to
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
// with runtime checks. It's more effective to let
// `areRuntimeChecksProfitable` determine if vectorization is beneficial
// for the loop.
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                     "small to consider vectorizing.\n");
    "The trip count is below the minimal threshold value.",
    "loop trip count is too low, avoiding vectorization",
    "LowTripCount", ORE, L);

// Check the function attributes to see if implicit floats or vectors are
// used.
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    "Can't vectorize when the NoImplicitFloat attribute is used",
    "loop not vectorized due to NoImplicitFloat attribute",
    "NoImplicitFloat", ORE, L);

// Check if the target supports potentially unsafe FP vectorization.
// FIXME: Add a check for the type of safety issue (denormal, signaling)
// for the target we're vectorizing for, to make sure none of the
// additional fp-math flags can help.
    "Potentially unsafe FP op prevents vectorization",
    "loop not vectorized due to unsafe FP support.",

bool AllowOrderedReductions;
// If the flag is set, use that instead and override the TTI behaviour.
       ExactFPMathInst->getDebugLoc(),
       ExactFPMathInst->getParent())
    << "loop not vectorized: cannot prove it is safe to reorder "
       "floating-point operations";
LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                     "reorder floating-point operations\n");
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,

// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,

// Get user vectorization factor and interleave count.
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);

bool AddBranchWeights =
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),

// Select the interleave count.
unsigned SelectedIC = std::max(IC, UserIC);

// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.

// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
if (!ForceVectorization &&
        DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
        << "loop not vectorized: cannot prove it is safe to reorder "
           "memory operations";
// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;

  LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
  VecDiagMsg = std::make_pair(
      "VectorizationNotBeneficial",
      "the cost-model indicates that vectorization is not beneficial");
  VectorizeLoop = false;

  // Tell the user interleaving was avoided up-front, despite being explicitly
  // requested.
  LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                       "interleaving should be avoided up front\n");
  IntDiagMsg = std::make_pair(
      "InterleavingAvoided",
      "Ignoring UserIC, because interleaving was avoided up front");
  InterleaveLoop = false;
} else if (IC == 1 && UserIC <= 1) {
  // Tell the user interleaving is not beneficial.
  IntDiagMsg = std::make_pair(
      "InterleavingNotBeneficial",
      "the cost-model indicates that interleaving is not beneficial");
  InterleaveLoop = false;
    IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
    IntDiagMsg.second +=
        " and is explicitly disabled or interleave count is set to 1";
} else if (IC > 1 && UserIC == 1) {
  // Tell the user interleaving is beneficial, but it is explicitly disabled.
  LLVM_DEBUG(
      dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
  IntDiagMsg = std::make_pair(
      "InterleavingBeneficialButDisabled",
      "the cost-model indicates that interleaving is beneficial "
      "but is explicitly disabled or interleave count is set to 1");
  InterleaveLoop = false;

// If there is a histogram in the loop, do not just interleave without
// vectorizing. The order of operations will be incorrect without the
// histogram intrinsics, which are only used for recipes with VF > 1.
if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
  LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                    << "to histogram operations.\n");
  IntDiagMsg = std::make_pair(
      "HistogramPreventsScalarInterleaving",
      "Unable to interleave without vectorization due to constraints on "
      "the order of histogram operations");
  InterleaveLoop = false;

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
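// Illustrative sketch (not part of this file): the four-way outcome that the
// diagnostics below report, written as a plain dispatch on the two flags.
// Enum and function names are hypothetical.
enum class LVDecision {
  GiveUp,
  InterleaveOnly,
  VectorizeOnly,
  VectorizeAndInterleave
};

static LVDecision pickDecision(bool VectorizeLoop, bool InterleaveLoop) {
  if (!VectorizeLoop && !InterleaveLoop)
    return LVDecision::GiveUp;                 // report both remarks, bail out
  if (!VectorizeLoop)
    return LVDecision::InterleaveOnly;         // interleave (unroll) the scalar loop
  if (!InterleaveLoop)
    return LVDecision::VectorizeOnly;          // widen by VF, no beneficial interleave
  return LVDecision::VectorizeAndInterleave;   // widen by VF and interleave by IC
}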
// Emit diagnostic messages, if any.
if (!VectorizeLoop && !InterleaveLoop) {
  // Do not vectorize or interleave the loop.
           L->getStartLoc(), L->getHeader())
        << VecDiagMsg.second;
           L->getStartLoc(), L->getHeader())
        << IntDiagMsg.second;

if (!VectorizeLoop && InterleaveLoop) {
           L->getStartLoc(), L->getHeader())
        << VecDiagMsg.second;
} else if (VectorizeLoop && !InterleaveLoop) {
        << ") in " << L->getLocStr() << '\n');
           L->getStartLoc(), L->getHeader())
        << IntDiagMsg.second;
} else if (VectorizeLoop && InterleaveLoop) {
        << ") in " << L->getLocStr() << '\n');

bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();

if (!VectorizeLoop) {
  assert(IC > 1 && "interleave count should not be 1 or 0");
  // If we decided that it is not legal to vectorize the loop, then
  // interleave it.
        << "interleaved loop (interleaved count: "
        << NV("InterleaveCount", IC) << ")";

  // If we decided that it is *legal* to vectorize the loop, then do it.

  // Consider vectorizing the epilogue too if it's profitable.
  std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());

  // The first pass vectorizes the main loop and creates a scalar epilogue
  // to be vectorized by executing the plan (potentially with a different
  // factor) again shortly afterwards.
                                       EPI, &LVL, &CM, BFI, PSI, Checks,
      *BestMainPlan, MainILV, DT, false);

  // Second pass vectorizes the epilogue and adjusts the control flow
  // edges from the first pass.
                                           Checks, BestEpiPlan);
      DT, true, &ExpandedSCEVs);
  ++LoopsEpilogueVectorized;

  DisableRuntimeUnroll = true;
      PSI, Checks, BestPlan);

  // Add metadata to disable runtime unrolling a scalar loop when there
  // are no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
    DisableRuntimeUnroll = true;

// Report the vectorization decision.
       "DT not preserved correctly");
std::optional<MDNode *> RemainderLoopID =
if (RemainderLoopID) {
  L->setLoopID(*RemainderLoopID);
  if (DisableRuntimeUnroll)

// Mark the loop as already vectorized to avoid vectorizing again.

// 1. the target claims to have no vector registers, and
// 2. interleaving won't help ILP.
//
// The second condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
bool Changed = false, CFGChanged = false;

// The vectorizer requires loops to be in simplified form.
// Since simplification may add new inner loops, it has to run before the
// legality and profitability checks. This means running the loop vectorizer
// will simplify all loops, regardless of whether anything ends up being
// vectorized.
for (const auto &L : *LI)
  Changed |= CFGChanged |=

// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
LoopsAnalyzed += Worklist.size();

// Now walk the identified inner loops.
while (!Worklist.empty()) {

// For the inner loops we actually process, form LCSSA to simplify the
// transform.

// Process each loop nest in the function.

// There are no loops in the function. Return before computing other
// expensive analyses.
if (!Result.MadeAnyChange)

if (Result.MadeCFGChange) {
  // Making CFG changes likely means a loop got vectorized. Indicate that
  // extra simplification passes should be run.
  // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
  // be run if runtime checks have been added.
  OS, MapClassName2PassName);
OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
static unsigned getIntrinsicID(const SDNode *N)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This is the interface for a simple mod/ref and alias analysis over globals.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Legalize the Machine IR a function s Machine IR
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void addRuntimeUnrollDisableMetaData(Loop *L)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL)
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
const char LLVMLoopVectorizeFollowupAll[]
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order reductions in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop \ TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static unsigned getEstimatedRuntimeVF(const Loop *L, const TargetTransformInfo &TTI, ElementCount VF)
This function attempts to return a value that represents the vectorization factor at runtime.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static void addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
static const uint32_t IV[8]
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
uint64_t getZExtValue() const
Get zero extended value.
int64_t getSExtValue() const
Get sign extended value.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
BinaryOps getOpcode() const
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_UGT
unsigned greater than
@ ICMP_ULT
unsigned less than
@ ICMP_ULE
unsigned less or equal
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
This is the shared class of boolean and integer constants.
static ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getFalse(LLVMContext &Context)
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
constexpr bool isScalar() const
Exactly one element.
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
void printDebugTracesAtEnd() override
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void printDebugTracesAtEnd() override
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
ConstantInt * getFalse()
Get the constant value for i1 false.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
bool areSafetyChecksAdded()
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitablity analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
Create and record the values for induction variables to resume coming from the additional bypass bloc...
VPBlockBase * VectorPHVPB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
induction header phi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
bool OptForSizeBasedOnProfile
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
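As a minimal sketch (not taken from this file), this is how a caller is expected to inspect an InstructionCost produced by the cost model, assuming the standard llvm::InstructionCost from Support/InstructionCost.h:

#include <optional>
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print a cost only if it is representable; getValue() is std::nullopt for
// costs built with InstructionCost::getInvalid().
static void printCost(InstructionCost Cost) {
  if (std::optional<InstructionCost::CostType> C = Cost.getValue())
    errs() << "cost: " << *C << "\n";
  else
    errs() << "cost: invalid\n";
}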
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
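A hedged illustration of the interleaving interface above (a sketch, not code from this pass): after analyzeInterleaving() has run, each group can be walked member by member, where a null member marks a gap.

#include <cstdint>
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Dump every member of each interleave group; getMember(I) returns nullptr for
// a gap, which is why getNumMembers() can be smaller than getFactor().
static void dumpInterleaveGroups(InterleavedAccessInfo &IAI) {
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    errs() << "group with factor " << Group->getFactor() << ":\n";
    for (uint32_t I = 0, F = Group->getFactor(); I != F; ++I)
      if (Instruction *Member = Group->getMember(I)) {
        Member->print(errs());
        errs() << "\n";
      }
  }
}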
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if the terminator of the block can branch to another block that is outside of the current loop.
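As a small sketch using only the Loop API listed above, a simplified version of the shape test performed before any deeper analysis:

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// An innermost loop with a preheader, a single latch and a single exiting
// block is the easy case; anything else needs extra handling or is rejected.
static bool hasSimpleVectorizableShape(const Loop &L) {
  return L.isInnermost() && L.getLoopPreheader() && L.getLoopLatch() &&
         L.getExitingBlock();
}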
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
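A minimal sketch of how these helpers are typically combined (not lifted from this file): run the DFS once, then visit the loop body in reverse post-order so definitions are seen before their uses.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
using namespace llvm;

static void forEachBlockInRPO(Loop *L, LoopInfo *LI,
                              function_ref<void(BasicBlock *)> Visit) {
  LoopBlocksDFS DFS(L);
  DFS.perform(LI); // cache the traversal once
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    Visit(BB);
}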
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
bool hasPredStores() const
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
const Function * TheFunction
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool runtimeChecksRequired()
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
A memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
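To make the decision caching above concrete, here is an illustrative (not authoritative) sketch of the query pattern around the widening-decision APIs; CM_Unknown is assumed to be the cost model's "no decision recorded yet" enumerator.

// Sketch only: assumes a LoopVectorizationCostModel &CM from this pass.
static InstructionCost memoryOpCost(LoopVectorizationCostModel &CM,
                                    Instruction *I, ElementCount VF) {
  if (CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Unknown)
    CM.setCostBasedWideningDecision(VF); // records decisions for memory ops at VF
  return CM.getWideningCost(I, VF);
}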
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
unsigned getNumStores() const
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
uint64_t getMaxSafeVectorWidthInBits() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool isSafeForAnyVectorWidth() const
unsigned getNumLoads() const
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if any known histogram operations were found in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isMaskRequired(const Instruction *I) const
Returns true if the vector representation of the instruction I requires a mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
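A hedged usage sketch of the legality interface (names and flow simplified relative to the real driver):

#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;

// Summarize what Legality found; only meaningful after canVectorize() succeeds.
static void summarizeLegality(LoopVectorizationLegality &LVL) {
  if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
    return;
  errs() << "inductions: " << LVL.getInductionVars().size() << "\n"
         << "reductions: " << LVL.getReductionVars().size() << "\n";
  if (PHINode *Primary = LVL.getPrimaryInduction())
    errs() << "primary IV: " << Primary->getName() << "\n";
}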
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
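Condensed into a sketch, the planner's external contract looks roughly like this; UF is normally chosen via selectInterleaveCount but is passed in here, and all error handling is omitted.

// Sketch of the plan -> pick -> execute flow; LVP, ILV and DT are assumed to
// have been set up by the caller exactly as the pass does.
static void vectorizeWithBestPlan(LoopVectorizationPlanner &LVP,
                                  InnerLoopVectorizer &ILV, DominatorTree *DT,
                                  ElementCount UserVF, unsigned UserIC,
                                  unsigned UF) {
  LVP.plan(UserVF, UserIC);                         // build candidate VPlans
  VectorizationFactor BestVF = LVP.computeBestVF(); // most profitable factor
  VPlan &BestPlan = LVP.getPlanFor(BestVF.Width);
  LVP.executePlan(BestVF.Width, UF, BestPlan, ILV, DT,
                  /*VectorizingEpilogue=*/false);
}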
This holds vectorization requirements that must be verified late in the process.
Instruction * getExactFPInst()
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool isScalableVectorizationDisabled() const
enum ForceKind getForce() const
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
bool isPotentiallyUnsafe() const
ElementCount getWidth() const
@ FK_Enabled
Forcing enabled.
@ FK_Undefined
Not selected.
@ FK_Disabled
Forcing disabled.
unsigned getPredicate() const
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
unsigned getInterleave() const
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
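A small sketch (assumptions noted in the comments) of how the hint queries above gate the pass:

// Sketch: decide whether to even try vectorizing, based only on metadata hints.
static bool hintsAllowVectorization(LoopVectorizeHints &Hints, Function *F,
                                    Loop *L, bool VectorizeOnlyWhenForced) {
  if (Hints.getForce() == LoopVectorizeHints::FK_Disabled)
    return false; // explicitly disabled via loop metadata
  return Hints.allowVectorization(F, L, VectorizeOnlyWhenForced);
}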
Represents a single loop in the control flow graph.
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
const MDOperand & getOperand(unsigned I) const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
unsigned getNumOperands() const
Return number of MDNode operands.
static MDString * get(LLVMContext &Context, StringRef Str)
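For context, a hedged sketch of how MDString/MDNode are combined into the kind of loop metadata property that setAlreadyVectorized() attaches (the real method also carries over the existing loop ID operands):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Build the {"llvm.loop.isvectorized", i32 1} property node.
static MDNode *makeIsVectorizedProperty(LLVMContext &Ctx) {
  Metadata *Ops[] = {
      MDString::get(Ctx, "llvm.loop.isvectorized"),
      ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(Ctx, 32), 1))};
  return MDNode::get(Ctx, Ops);
}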
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
bool contains(const KeyT &Key) const
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR unit...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
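A minimal sketch of the query pattern: the expressions PSE returns are only valid under its accumulated predicate, which the user must verify or emit as a runtime check.

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Returns the backedge-taken count if SCEV can compute one under the current
// predicate, or nullptr otherwise.
static const SCEV *backedgeTakenCountOrNull(PredicatedScalarEvolution &PSE) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  return isa<SCEVCouldNotCompute>(BTC) ? nullptr : BTC;
}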
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
void preserve()
Mark an analysis as preserved.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
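A hedged example of the trip-count query used in profitability heuristics such as the small-trip-count threshold:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// getSmallConstantTripCount() returns 0 when the trip count is unknown or not
// a small constant, so 0 must be treated as "no information".
static bool hasKnownTripCountBelow(ScalarEvolution &SE, const Loop *L,
                                   unsigned Threshold) {
  unsigned TC = SE.getSmallConstantTripCount(L);
  return TC != 0 && TC < Threshold;
}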
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool isVScaleKnownToBeAPowerOfTwo() const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool supportsScalableVectors() const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
PartialReductionExtendKind
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
bool enableScalableVectorization() const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool preferFixedOverScalableIfEqualCost() const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
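Illustration only (the real cost model adds operand info, context instructions and many special cases): the basic shape of a TTI throughput query for a widened binary operation at a candidate VF.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost costOfWideAdd(const TargetTransformInfo &TTI,
                                     Type *ElementTy, ElementCount VF) {
  auto *WideTy = VectorType::get(ElementTy, VF);
  return TTI.getArithmeticInstrCost(Instruction::Add, WideTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}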
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isTokenTy() const
Return true if this is 'token'.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
RecipeListTy::iterator iterator
Instruction iterators...
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
iterator begin()
Recipe iterator methods.
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
void insert(VPRecipeBase *Recipe, iterator InsertPt)
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
VPRegionBlock * getParent()
const VPBasicBlock * getExitingBasicBlock() const
void setName(const Twine &newName)
size_t getNumSuccessors() const
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
VPBlockBase * getSinglePredecessor() const
const VPBasicBlock * getEntryBasicBlock() const
VPBlockBase * getSingleSuccessor() const
const VPBlocksTy & getSuccessors() const
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
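A hedged sketch against the builder interface listed above (VPBuilder is internal to the vectorizer; the helper name here is illustrative):

// Emit select(!Mask, A, B) at the end of VPBB using the VPBuilder API.
static VPValue *emitInvertedSelect(VPBuilder &Builder, VPBasicBlock *VPBB,
                                   VPValue *Mask, VPValue *A, VPValue *B) {
  Builder.setInsertPoint(VPBB);
  VPValue *NotMask = Builder.createNot(Mask);
  return Builder.createSelect(NotMask, A, B);
}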
Canonical scalar induction phi of the vector loop.
Type * getScalarType() const
Returns the scalar type of the induction.
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
VPValue * getStartValue() const
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
A special type of VPBasicBlock that wraps an existing IR basic block.
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
This is a concrete Recipe that models a single VPlan-level instruction.
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getFirstLane()
A recipe for forming partial reductions.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
VPBasicBlock * getParent()
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
A recipe for handling reduction phis.
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
const RecurrenceDescriptor & getRecurrenceDescriptor() const
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
const VPBlockBase * getEntry() const
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
void setOperand(unsigned I, VPValue *New)
unsigned getNumOperands() const
VPValue * getOperand(unsigned N) const
void addOperand(VPValue *Operand)
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
void replaceAllUsesWith(VPValue *New)
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
A recipe to compute the pointers for widened memory accesses of IndexTy.
A recipe for widening Call instructions using library calls.
A Recipe for widening the canonical induction variable of the vector loop.
VPWidenCastRecipe is a recipe to create vector cast instructions.
A recipe for handling GEP instructions.
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
VPValue * getStepValue()
Returns the step value of the induction.
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
A recipe for widening vector intrinsics.
A common base class for widening memory operations.
A recipe for handling phis that are widened in the vector loop.
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient output...
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
VPBasicBlock * getEntry()
VPValue & getVectorTripCount()
The vector trip count.
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
VPValue & getVF()
Returns the VF of the vector loop region.
VPValue * getTripCount() const
The trip count of the original loop.
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
bool hasVF(ElementCount VF)
bool hasUF(unsigned UF) const
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
void setEntry(VPBasicBlock *VPBB)
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
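A small sketch of how candidate plans are filtered by VF, using only hasVF() from the interface above; the container is assumed to be the VPlanPtr list the planner keeps.

// Return the first plan that covers VF, or nullptr if none of them do.
static VPlan *findPlanWithVF(ArrayRef<VPlanPtr> Plans, ElementCount VF) {
  for (const VPlanPtr &P : Plans)
    if (P->hasVF(VF))
      return P.get();
  return nullptr;
}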
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUser() const
Return true if there is exactly one user of this value.
void setName(const Twine &Name)
Change the name of the value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
LLVMContext & getContext() const
All values hold a context through their type.
StringRef getName() const
Return a constant reference to the value's name.
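A minimal sketch of how the Value interface above is typically used, via a hypothetical helper that forwards uses from one value to another (the helper name is illustrative, not taken from this file):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

// Hypothetical helper: give NewV the old value's name, then forward its uses.
static void forwardUses(llvm::Value *OldV, llvm::Value *NewV) {
  if (OldV->getType() != NewV->getType())      // RAUW requires matching types
    return;
  NewV->setName(OldV->getName());
  if (OldV->hasOneUser())
    OldV->replaceAllUsesWith(NewV);            // single user: plain RAUW
  else
    OldV->replaceUsesWithIf(NewV, [](llvm::Use &U) {
      // Predicate form: only rewrite uses that sit in instructions.
      return llvm::isa<llvm::Instruction>(U.getUser());
    });
}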
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
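A short sketch of VectorType::get with both fixed and scalable element counts; the wrapper function and its name are illustrative only:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

// Builds <4 x i32> and <vscale x 4 x i32> in the caller's context.
static void makeVectorTypes(llvm::LLVMContext &Ctx) {
  llvm::Type *Int32Ty = llvm::Type::getInt32Ty(Ctx);
  if (!llvm::VectorType::isValidElementType(Int32Ty))
    return;
  auto *FixedTy =
      llvm::VectorType::get(Int32Ty, llvm::ElementCount::getFixed(4));
  auto *ScalableTy =
      llvm::VectorType::get(Int32Ty, llvm::ElementCount::getScalable(4));
  (void)FixedTy;
  (void)ScalableTy;
}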
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
constexpr ScalarTy getFixedValue() const
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isNonZero() const
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
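The ElementCount queries above let the vectorizer reason about vectorization factors without assuming a fixed lane count; a minimal sketch (function name is illustrative):

#include "llvm/Support/TypeSize.h"

static void elementCountDemo() {
  llvm::ElementCount VF = llvm::ElementCount::getScalable(4); // <vscale x 4> lanes
  bool Scalable = VF.isScalable();            // true: scaled by runtime vscale
  unsigned MinLanes = VF.getKnownMinValue();  // 4, the guaranteed minimum
  llvm::ElementCount Half = VF.divideCoefficientBy(2);   // vscale x 2
  bool LE = llvm::ElementCount::isKnownLE(Half, VF);     // provably true for any vscale
  (void)Scalable;
  (void)MinLanes;
  (void)LE;
}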
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
A raw_ostream that writes to an std::string.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
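llvm_unreachable conventionally terminates fully covered switches; a hedged sketch with a made-up enum:

#include "llvm/Support/ErrorHandling.h"

enum class ExampleWidening { Scalarize, Widen };    // hypothetical, for illustration
static const char *wideningName(ExampleWidening W) {
  switch (W) {
  case ExampleWidening::Scalarize:
    return "scalarize";
  case ExampleWidening::Widen:
    return "widen";
  }
  llvm_unreachable("unknown ExampleWidening kind"); // every case handled above
}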
@ PredicateElseScalarEpilogue
@ PredicateOrDontVectorize
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always be performed.
@ C
The default llvm calling convention, compatible with C.
ID ArrayRef< Type * > Tys
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
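The PatternMatch helpers above compose into declarative IR matchers; a minimal sketch that recognizes a single-use widening multiply, mul(zext|sext A, zext|sext B), with an illustrative function name:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool isWideningMul(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  // On success, A and B capture the narrow operands feeding the extended multiply.
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}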
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
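cl::init and cl::values seed command-line options in the same style as the loop-vectorize flags declared near the top of this file; the option names and the enum below are made up for illustration:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<unsigned> ExampleSmallLoopCost(
    "example-small-loop-cost", llvm::cl::init(20), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical threshold option."));

enum class ExampleMode { Off, On };
static llvm::cl::opt<ExampleMode> ExampleModeOpt(
    "example-mode", llvm::cl::init(ExampleMode::Off), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical enumerated option."),
    llvm::cl::values(clEnumValN(ExampleMode::Off, "off", "Disable the feature"),
                     clEnumValN(ExampleMode::On, "on", "Enable the feature")));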
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
NodeAddr< PhiNode * > Phi
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
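A minimal all_of sketch over a basic block (the sibling any_of, none_of, and count_if entries below follow the same range-plus-predicate shape); the function name is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// True when no instruction in BB may have side effects.
static bool blockIsSideEffectFree(const llvm::BasicBlock &BB) {
  return llvm::all_of(BB, [](const llvm::Instruction &I) {
    return !I.mayHaveSideEffects();
  });
}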
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of a load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
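A minimal enumerate sketch pairing call operands with their indices; the function name is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"

static void dumpConstantArgs(const llvm::CallInst *CI) {
  for (const auto &En : llvm::enumerate(CI->args()))
    if (llvm::isa<llvm::Constant>(En.value()))
      llvm::errs() << "operand " << En.index() << " is a constant\n";
}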
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
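getRuntimeVF materializes the VF as an IR value, which matters for scalable VFs where the lane count is only a runtime multiple of vscale. A hedged usage sketch, assuming an IRBuilder positioned where the value is needed; the include for the helper itself is omitted (it is the declaration listed above):

#include "llvm/IR/IRBuilder.h"

static llvm::Value *emitRuntimeVF(llvm::IRBuilderBase &Builder) {
  // A fixed VF folds to a constant; a scalable VF expands to the
  // equivalent of vscale times the known minimum lane count.
  return llvm::getRuntimeVF(Builder, Builder.getInt64Ty(),
                            llvm::ElementCount::getScalable(4));
}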
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
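make_early_inc_range is the usual way to erase while iterating, since the iterator is advanced before the current element is touched; a minimal sketch (function name illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"   // wouldInstructionBeTriviallyDead

static void dropDeadInstructions(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::wouldInstructionBeTriviallyDead(&I))
      I.eraseFromParent();                 // safe: iteration already moved on
}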
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of a load or store instruction.
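The getLoadStore* helpers listed in this section (pointer operand, type, alignment, address space) give uniform access to memory instructions; a minimal sketch, assuming I is known to be a load or store:

#include "llvm/IR/Instructions.h"
#include <cassert>

static void describeMemOp(llvm::Instruction *I) {
  assert((llvm::isa<llvm::LoadInst>(I) || llvm::isa<llvm::StoreInst>(I)) &&
         "expected a load or store");
  llvm::Value *Ptr = llvm::getLoadStorePointerOperand(I);   // accessed address
  llvm::Type *AccessTy = llvm::getLoadStoreType(I);         // loaded/stored type
  llvm::Align Alignment = llvm::getLoadStoreAlignment(I);   // access alignment
  unsigned AS = llvm::getLoadStoreAddressSpace(I);          // pointer address space
  (void)Ptr;
  (void)AccessTy;
  (void)Alignment;
  (void)AS;
}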
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
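The vp_depth_first_* adaptors walk the VPlan block graph; a hedged sketch (VPlan headers assumed to be available) counting the VPBasicBlocks reachable from the entry without descending into nested regions:

static unsigned countShallowVPBBs(llvm::VPlan &Plan) {
  unsigned NumVPBBs = 0;
  for (llvm::VPBlockBase *VPB :
       llvm::vp_depth_first_shallow(Plan.getEntry()))
    if (llvm::isa<llvm::VPBasicBlock>(VPB))
      ++NumVPBBs;
  return NumVPBBs;
}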
auto map_range(ContainerTy &&C, FuncTy F)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
void sort(IteratorTy Start, IteratorTy End)
std::unique_ptr< VPlan > VPlanPtr
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
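make_filter_range lazily skips elements that fail a predicate; a minimal sketch that visits only the stores of a block (function name illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

static unsigned countStores(llvm::BasicBlock &BB) {
  auto Stores = llvm::make_filter_range(BB, [](const llvm::Instruction &I) {
    return llvm::isa<llvm::StoreInst>(I);
  });
  unsigned NumStores = 0;
  for (const llvm::Instruction &S : Stores) {
    (void)S;
    ++NumStores;
  }
  return NumStores;
}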
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
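divideCeil is how a trip count is rounded up to whole vector iterations; a tiny worked sketch with made-up numbers:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

static uint64_t vectorIterations(uint64_t TripCount, uint64_t VF) {
  // e.g. divideCeil(1003, 8) == 126, since 125 * 8 == 1000 leaves a remainder.
  return llvm::divideCeil(TripCount, VF);
}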
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
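alignTo rounds a byte size up to the next multiple of an Align; a minimal sketch:

#include "llvm/Support/Alignment.h"
#include <cstdint>

static uint64_t paddedSize(uint64_t Size) {
  // e.g. Size == 13 yields 16, Size == 40 yields 48.
  return llvm::alignTo(Size, llvm::Align(16));
}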
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal, or the list is empty.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
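hash_combine folds several values into one hash_code, for example when building a map key; the key struct below is hypothetical:

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instruction.h"

// Hypothetical cache key pairing an instruction with a vectorization factor.
struct ExampleCostKey {
  const llvm::Instruction *I;
  unsigned VF;
};

static llvm::hash_code hash_value(const ExampleCostKey &K) {
  return llvm::hash_combine(K.I, K.VF);   // fold the pointer and the integer
}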
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
A special type used by analysis passes to provide an address that identifies that particular analysis...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
BasicBlock * SCEVSafetyCheck
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
BasicBlock * MemSafetyCheck
BasicBlock * MainLoopIterationCountCheck
BasicBlock * EpilogueIterationCountCheck
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
std::optional< unsigned > MaskPos
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
LoopVectorizeResult runImpl(Function &F)
bool processLoop(Loop *L)
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A recipe for handling first-order recurrence phis.
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
VPlan * Plan
Pointer to the VPlan code is generated for.
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
A recipe for widening load operations, using the address to load from and an optional mask.
A recipe for widening select instructions.
A recipe for widening store operations, using the stored value, the address to store to and an option...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed)
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static bool handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static bool HoistRuntimeChecks