//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
// Data for SIMD.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC - two years later.
//
// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
// Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

/// Metadata attribute names
static const char *const LLVMLoopVectorizeFollowupVectorized =
    "llvm.loop.vectorize.followup_vectorized";
static const char *const LLVMLoopVectorizeFollowupEpilogue =
    "llvm.loop.vectorize.followup_epilogue";
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
180cl::desc(
"Enable vectorization of epilogue loops."));
184cl::desc(
"When epilogue vectorization is enabled, and a value greater than " 185"1 is specified, forces the given VF for all applicable epilogue " 189"epilogue-vectorization-minimum-VF",
cl::Hidden,
190cl::desc(
"Only loops with vectorization factor equal to or larger than " 191"the specified value are considered for epilogue vectorization."));
/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."),
    cl::desc("The maximum allowed number of runtime memory checks"));
// Option prefer-predicate-over-epilogue indicates that an epilogue is
// undesired, that predication is preferred, and this lists all options. I.e.,
// the vectorizer will try to fold the tail-loop (epilogue) into the vector
// body and predicate the instructions accordingly. If tail-folding fails,
// there are different fallback strategies depending on these values:
} // namespace PreferPredicateTy

    "prefer-predicate-over-epilogue",
222cl::desc(
"Tail-folding and predication preferences over creating a scalar " 226"Don't tail-predicate loops, create scalar epilogue"),
228"predicate-else-scalar-epilogue",
229"prefer tail-folding, create scalar epilogue if tail " 232"predicate-dont-vectorize",
233"prefers tail-folding, don't attempt vectorization if " 234"tail-folding fails.")));
237"force-tail-folding-style",
cl::desc(
"Force the tail folding style"),
240clEnumValN(TailFoldingStyle::None,
"none",
"Disable tail folding"),
242 TailFoldingStyle::Data,
"data",
243"Create lane mask for data only, using active.lane.mask intrinsic"),
244clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
245"data-without-lane-mask",
246"Create lane mask with compare/stepvector"),
247clEnumValN(TailFoldingStyle::DataAndControlFlow,
"data-and-control",
248"Create lane mask using active.lane.mask intrinsic, and use " 249"it for both data and control flow"),
250clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
251"data-and-control-without-rt-check",
252"Similar to data-and-control, but remove the runtime check"),
253clEnumValN(TailFoldingStyle::DataWithEVL,
"data-with-evl",
254"Use predicated EVL instructions for tail folding. If EVL " 255"is unsupported, fallback to data-without-lane-mask.")));
259cl::desc(
"Maximize bandwidth when selecting vectorization factor which " 260"will be determined by the smallest type in loop."));
264cl::desc(
"Enable vectorization on interleaved memory accesses in a loop"));
/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
    cl::desc(
"Enable vectorization on masked interleaved memory accesses in a loop"));
274cl::desc(
"A flag that overrides the target's number of scalar registers."));
278cl::desc(
"A flag that overrides the target's number of vector registers."));
282cl::desc(
"A flag that overrides the target's max interleave factor for " 287cl::desc(
"A flag that overrides the target's max interleave factor for " 288"vectorized loops."));
292cl::desc(
"A flag that overrides the target's expected cost for " 293"an instruction to a single constant value. Mostly " 294"useful for getting consistent testing."));
299"Pretend that scalable vectors are supported, even if the target does " 300"not support them. This flag should only be used for testing."));
305"The cost of a loop that is considered 'small' by the interleaver."));
309cl::desc(
"Enable the use of the block frequency analysis to access PGO " 310"heuristics minimizing code growth in cold regions and being more " 311"aggressive in hot regions."));
// Interleave loops at runtime for load/store throughput.
        "Enable runtime interleaving until load/store ports are saturated"));
/// The number of stores in a loop that are allowed to need predication.
    cl::desc(
"Max number of stores to be predicated behind an if."));
326cl::desc(
"Count the induction variable only once when interleaving"));
330cl::desc(
"Enable if predication of stores during vectorization."));
334cl::desc(
"The maximum interleave count to use when interleaving a scalar " 335"reduction in a nested loop."));
340cl::desc(
"Prefer in-loop vector reductions, " 341"overriding the targets preference."));
345cl::desc(
"Enable the vectorisation of loops with in-order (strict) " 351"Prefer predicating a reduction operation over an after loop select."));
356cl::desc(
"Enable VPlan-native vectorization path with " 357"support for outer loop vectorization."));
// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));
373cl::desc(
"Enable loop interleaving in Loop vectorization passes"));
376cl::desc(
"Run the Loop vectorization passes"));
379"force-widen-divrem-via-safe-divisor",
cl::Hidden,
381"Override cost based safe divisor widening for div/rem instructions"));
384"vectorizer-maximize-bandwidth-for-vector-calls",
cl::init(
true),
386cl::desc(
"Try wider VFs if they enable the use of vector variants"));
391"Enable vectorization of early exit loops with uncountable exits."));
// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
// Likelihood of bypassing the vectorized loop because pointers overlap. See
// `emitMemRuntimeChecks`.
// Likelihood of bypassing the vectorized loop because there are zero trips left
// after prolog. See `emitIterationCountCheck`.

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
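// For example (illustrative): on targets where x86_fp80 has a type size of 80
// bits but an allocation size of 96 or 128 bits, arrays of x86_fp80 contain
// padding between elements, so the helper above reports the type as irregular
// and <N x x86_fp80> cannot be treated as bitcast compatible with such arrays.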
413/// Returns "best known" trip count for the specified loop \p L as defined by 414/// the following procedure: 415/// 1) Returns exact trip count if it is known. 416/// 2) Returns expected trip count according to profile data if any. 417/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax. 418/// 4) Returns std::nullopt if all of the above failed. 419static std::optional<unsigned>
                                           bool CanUseConstantMax = true) {
  // Check if exact trip count is known.
  // Check if there is an expected trip count available from profile data.
  if (!CanUseConstantMax)
  // Check if upper bound estimate is known.

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;
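// Worked example for the "best known" trip count procedure above
// (illustrative): for a loop whose exact trip count SCEV cannot compute but
// whose latch branch carries profile weights of 96 (backedge) to 4 (exit),
// step 2) yields an expected trip count of roughly 96/4 + 1 = 25, which later
// cost decisions (e.g. sizing runtime checks) can use instead of giving up.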
452/// InnerLoopVectorizer vectorizes loops which contain only one basic 453/// block to a specified vectorization factor (VF). 454/// This class performs the widening of scalars into vectors, or multiple 455/// scalars. This class also implements the following features: 456/// * It inserts an epilogue loop for handling loops that don't have iteration 457/// counts that are known to be a multiple of the vectorization factor. 458/// * It handles the code generation for reduction variables. 459/// * Scalarization (implementation using scalars) of un-vectorizable 461/// InnerLoopVectorizer does not perform any vectorization-legality 462/// checks, and relies on the caller to check for the different legality 463/// aspects. The InnerLoopVectorizer relies on the 464/// LoopVectorizationLegality class to provide information about the induction 465/// and reduction variables that were found to a given vectorization factor. 484// Query this against the original loop and save it here because the profile 485// of the original loop header may change as the transformation happens. 492 /// Create a new empty loop that will contain vectorized instructions later 493 /// on, while the old loop will be used as the scalar remainder. Control flow 494 /// is generated around the vectorized (and scalar epilogue) loops consisting 495 /// of various checks and bypasses. Return the pre-header block of the new 496 /// loop. In the case of epilogue vectorization, this function is overriden to 497 /// handle the more complex control flow around the loops. \p ExpandedSCEVs is 498 /// used to look up SCEV expansions for expressions needed during skeleton 503 /// Fix the vectorized code, taking care of header phi's, and more. 506// Return true if any runtime check is added. 509 /// A helper function to scalarize a single Instruction in the innermost loop. 510 /// Generates a sequence of scalar instances for each lane between \p MinLane 511 /// and \p MaxLane, times each part between \p MinPart and \p MaxPart, 512 /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p 513 /// Instr's operands. 518 /// Fix the non-induction PHIs in \p Plan. 521 /// Returns the original loop trip count. 524 /// Used to set the trip count after ILV's construction and after the 525 /// preheader block has been executed. Note that this always holds the trip 526 /// count of the original loop for both main loop and epilogue vectorization. 529// Retrieve the additional bypass value associated with an original 530 /// induction header phi. 535 /// Return the additional bypass block which targets the scalar loop by 536 /// skipping the epilogue loop after completing the main loop. 539"Trying to access AdditionalBypassBlock but it has not been set");
546 /// Iteratively sink the scalarized operands of a predicated instruction into 547 /// the block that was created for it. 550 /// Returns (and creates if needed) the trip count of the widened loop. 553 /// Emit a bypass check to see if the vector trip count is zero, including if 557 /// Emit a bypass check to see if all of the SCEV assumptions we've 558 /// had to make are correct. Returns the block containing the checks or 559 /// nullptr if no checks have been added. 562 /// Emit bypass checks to check any memory assumptions we may have made. 563 /// Returns the block containing the checks or nullptr if no checks have been 567 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check, 568 /// vector loop preheader, middle block and scalar preheader. 571 /// Create and record the values for induction variables to resume coming from 572 /// the additional bypass block. 574Value *MainVectorTripCount);
576 /// Allow subclasses to override and print debug traces before/after vplan 577 /// execution, when trace information is requested. 581 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the 582 /// vector preheader and its predecessor, also connecting the new block to the 583 /// scalar preheader. 586 /// The original loop. 589 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies 590 /// dynamic knowledge to simplify SCEV expressions and converts them to a 591 /// more usable form. 600 /// Target Library Info. 603 /// Target Transform Info. 606 /// Assumption Cache. 609 /// Interface to emit optimization remarks. 612 /// The vectorization SIMD factor to use. Each vector will have this many 618 /// The vectorization unroll factor to use. Each scalar is vectorized to this 619 /// many different vector instructions. 622 /// The builder that we use 625// --- Vectorization state --- 627 /// The vector-loop preheader. 630 /// The scalar-loop preheader. 633 /// Middle Block between the vector and the scalar. 636 /// A list of all bypass blocks. The first block is the entry of the loop. 639 /// Store instructions that were predicated. 642 /// Trip count of the original loop. 645 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF)) 648 /// The legality analysis. 651 /// The profitablity analysis. 654// Record whether runtime checks are added. 657 /// BFI and PSI are used to check for profile guided size optimizations. 661// Whether this loop should be optimized for size based on profile guided size 665 /// Structure to hold information about generated runtime checks, responsible 666 /// for cleaning the checks, if vectorization turns out unprofitable. 669 /// Mapping of induction phis to their additional bypass values. They 670 /// need to be added as operands to phi nodes in the scalar loop preheader 671 /// after the epilogue skeleton has been created. 674 /// The additional bypass block which conditionally skips over the epilogue 675 /// loop after executing the main loop. Needed to resume inductions and 676 /// reductions during epilogue vectorization. 681 /// The vector preheader block of \p Plan, used as target for check blocks 682 /// introduced during skeleton creation. 686/// Encapsulate information regarding vectorization of a loop and its epilogue. 687/// This information is meant to be updated and used across two stages of 688/// epilogue vectorization. 708"A high UF for the epilogue loop is likely not beneficial.");
712/// An extension of the inner loop vectorizer that creates a skeleton for a 713/// vectorized loop that has its epilogue (residual) also vectorized. 714/// The idea is to run the vplan on a given loop twice, firstly to setup the 715/// skeleton and vectorize the main loop, and secondly to complete the skeleton 716/// from the first step and vectorize the epilogue. This is achieved by 717/// deriving two concrete strategy classes from this base class and invoking 718/// them in succession from the loop vectorizer planner. 730EPI.MainLoopVF,
EPI.MainLoopVF,
EPI.MainLoopUF, LVL,
734// Override this function to handle the more complex control flow around the 741 /// The interface for creating a vectorized skeleton using one of two 742 /// different strategies, each corresponding to one execution of the vplan 743 /// as described above. 747 /// Holds and updates state information required to vectorize the main loop 748 /// and its epilogue in two separate passes. This setup helps us avoid 749 /// regenerating and recomputing runtime safety checks. It also helps us to 750 /// shorten the iteration-count-check path length for the cases where the 751 /// iteration count of the loop is so small that the main vector loop is 752 /// completely skipped. 756/// A specialized derived class of inner loop vectorizer that performs 757/// vectorization of *main* loops in the process of vectorizing loops and their 771 /// Implements the interface for creating a vectorized skeleton using the 772 /// *main loop* strategy (ie the first pass of vplan execution). 777 /// Emits an iteration count bypass check once for the main loop (when \p 778 /// ForEpilogue is false) and once for the epilogue loop (when \p 779 /// ForEpilogue is true). 785// A specialized derived class of inner loop vectorizer that performs 786// vectorization of *epilogue* loops in the process of vectorizing loops and 802 /// Implements the interface for creating a vectorized skeleton using the 803 /// *epilogue loop* strategy (ie the second pass of vplan execution). 808 /// Emits an iteration count bypass check after the main vector loop has 809 /// finished to see if there are any iterations left to execute by either 810 /// the vector epilogue or the scalar epilogue. 817}
// end namespace llvm 819/// Look for a meaningful debug location on the instruction or its operands. 825if (
I->getDebugLoc() != Empty)
826returnI->getDebugLoc();
828for (
Use &
Op :
I->operands()) {
830if (OpInst->getDebugLoc() != Empty)
831return OpInst->getDebugLoc();
834returnI->getDebugLoc();
837/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I 838/// is passed, the message relates to that particular instruction. 843dbgs() <<
"LV: " << Prefix << DebugMsg;
852/// Create an analysis remark that explains why vectorization failed 854/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p 855/// RemarkName is the identifier for the remark. If \p I is passed it is an 856/// instruction that prevents vectorization. Otherwise \p TheLoop is used for 857/// the location of the remark. If \p DL is passed, use it as debug location for 858/// the remark. \return the remark object that can be streamed to. 863// If debug location is attached to the instruction, use it. Otherwise if DL 864// was not provided, use the loop's. 865if (
I &&
I->getDebugLoc())
875/// Return a value for Step multiplied by VF. 882/// Return the runtime value for VF. 884returnB.CreateElementCount(Ty, VF);
895 <<
"loop not vectorized: " << OREMsg);
898/// Reports an informative message: print \p Msg for debugging purposes as well 899/// as an optimization remark. Uses either \p I as location of the remark, or 900/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the 901/// remark. If \p DL is passed, use it as debug location for the remark. 913/// Report successful vectorization of the loop. In case an outer loop is 914/// vectorized, prepend "outer" to the vectorization remark. 918"Vectorizing: ", TheLoop->
isInnermost() ?
"innermost loop" :
"outer loop",
924 <<
"vectorized " << LoopType <<
"loop (vectorization width: " 926 <<
", interleaved count: " <<
ore::NV(
"InterleaveCount", IC) <<
")";
930}
// end namespace llvm 934// Loop vectorization cost-model hints how the scalar epilogue loop should be 938// The default: allowing scalar epilogues. 941// Vectorization with OptForSize: don't allow epilogues. 944// A special case of vectorisation with OptForSize: loops with a very small 945// trip count are considered for vectorization under OptForSize, thereby 946// making sure the cost of their loop body is dominant, free of runtime 947// guards and scalar iteration overheads. 950// Loop hint predicate indicating an epilogue is undesired. 953// Directive indicating we must either tail fold or not vectorize 959/// LoopVectorizationCostModel - estimates the expected speedups due to 961/// In many cases vectorization is not profitable. This can happen because of 962/// a number of reasons. In this class we mainly attempt to predict the 963/// expected speedup/slowdowns due to the supported instruction set. We use the 964/// TargetTransformInfo to query the different backends for the cost of 965/// different operations. 983 /// \return An upper bound for the vectorization factors (both fixed and 984 /// scalable). If the factors are 0, vectorization and interleaving should be 985 /// avoided up front. 988 /// \return True if runtime checks are required for vectorization, and false 992 /// Setup cost-based decisions for user vectorization factor. 993 /// \return true if the UserVF is a feasible VF to be chosen. 1000 /// \return The size (in bits) of the smallest and widest types in the code 1001 /// that needs to be vectorized. We ignore values that remain scalar such as 1002 /// 64 bit loop indices. 1005 /// \return The desired interleave count. 1006 /// If interleave count has been specified by metadata it will be returned. 1007 /// Otherwise, the interleave count is computed and returned. VF and LoopCost 1008 /// are the selected vectorization factor and the cost of the selected VF. 1011 /// Memory access instruction may be vectorized in more than one way. 1012 /// Form of instruction after vectorization depends on cost. 1013 /// This function takes cost-based decisions for Load/Store instructions 1014 /// and collects them in a map. This decisions map is used for building 1015 /// the lists of loop-uniform and loop-scalar instructions. 1016 /// The calculated cost is saved with widening decision in order to 1017 /// avoid redundant calculations. 1020 /// A call may be vectorized in different ways depending on whether we have 1021 /// vectorized variants available and whether the target supports masking. 1022 /// This function analyzes all calls in the function at the supplied VF, 1023 /// makes a decision based on the costs of available options, and stores that 1024 /// decision in a map for use in planning and plan execution. 1027 /// A struct that represents some properties of the register usage 1030 /// Holds the number of loop invariant values that are used in the loop. 1031 /// The key is ClassID of target-provided register class. 1033 /// Holds the maximum number of concurrent live intervals in the loop. 1034 /// The key is ClassID of target-provided register class. 1038 /// \return Returns information about the register usages of the loop for the 1039 /// given vectorization factors. 1043 /// Collect values we want to ignore in the cost model. 1046 /// Collect all element types in the loop for which widening is needed. 1049 /// Split reductions into those that happen in the loop, and those that happen 1050 /// outside. In loop reductions are collected into InLoopReductions. 
1053 /// Returns true if we should use strict in-order reductions for the given 1054 /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, 1055 /// the IsOrdered flag of RdxDesc is set and we do not allow reordering 1056 /// of FP operations. 1061 /// \returns The smallest bitwidth each instruction can be represented with. 1062 /// The vector equivalents of these instructions should be truncated to this 1068 /// \returns True if it is more profitable to scalarize instruction \p I for 1069 /// vectorization factor \p VF. 1072"Profitable to scalarize relevant only for VF > 1.");
1075"cost-model should not be used for outer loops (in VPlan-native path)");
1077auto Scalars = InstsToScalarize.find(VF);
1078assert(Scalars != InstsToScalarize.end() &&
1079"VF not yet analyzed for scalarization profitability");
1080return Scalars->second.contains(
I);
1083 /// Returns true if \p I is known to be uniform after vectorization. 1087"cost-model should not be used for outer loops (in VPlan-native path)");
1088// Pseudo probe needs to be duplicated for each unrolled iteration and 1089// vector lane so that profiled loop trip count can be accurately 1090// accumulated instead of being under counted. 1091if (isa<PseudoProbeInst>(
I))
1097auto UniformsPerVF = Uniforms.find(VF);
1098assert(UniformsPerVF != Uniforms.end() &&
1099"VF not yet analyzed for uniformity");
1100return UniformsPerVF->second.count(
I);
1103 /// Returns true if \p I is known to be scalar after vectorization. 1107"cost-model should not be used for outer loops (in VPlan-native path)");
1111auto ScalarsPerVF = Scalars.find(VF);
1112assert(ScalarsPerVF != Scalars.end() &&
1113"Scalar values are not calculated for VF");
1114return ScalarsPerVF->second.count(
I);
1117 /// \returns True if instruction \p I can be truncated to a smaller bitwidth 1118 /// for vectorization factor \p VF. 1120return VF.
isVector() && MinBWs.contains(
I) &&
1125 /// Decision that was taken during cost calculation for memory instruction. 1137 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1138 /// instruction \p I and vector width \p VF. 1142 WideningDecisions[std::make_pair(
I, VF)] = std::make_pair(W,
Cost);
1145 /// Save vectorization decision \p W and \p Cost taken by the cost model for 1146 /// interleaving group \p Grp and vector width \p VF. 1151 /// Broadcast this decicion to all instructions inside the group. 1152 /// When interleaving, the cost will only be assigned one instruction, the 1153 /// insert position. For other cases, add the appropriate fraction of the 1154 /// total cost to each instruction. This ensures accurate costs are used, 1155 /// even if the insert position instruction is not used. 1164 WideningDecisions[std::make_pair(
I, VF)] =
1165 std::make_pair(W, InsertPosCost);
1167 WideningDecisions[std::make_pair(
I, VF)] =
1168 std::make_pair(W, OtherMemberCost);
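  // Worked example (illustrative): for an interleave group of 4 members with a
  // total cost of 12, an interleave decision records the whole 12 on the
  // insert-position member (InsertPosCost) and 0 on the remaining members,
  // while a non-interleave decision records 12 / 4 = 3 on every member
  // (OtherMemberCost), so per-instruction costs stay meaningful even when the
  // insert position itself has no uses.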
1173 /// Return the cost model decision for the given instruction \p I and vector 1174 /// width \p VF. Return CM_Unknown if this instruction did not pass 1175 /// through the cost modeling. 1180"cost-model should not be used for outer loops (in VPlan-native path)");
1182 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1183auto Itr = WideningDecisions.
find(InstOnVF);
1184if (Itr == WideningDecisions.
end())
1186return Itr->second.first;
1189 /// Return the vectorization cost for the given instruction \p I and vector 1193 std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(
I, VF);
1195"The cost is not calculated");
1196return WideningDecisions[InstOnVF].second;
1209 std::optional<unsigned> MaskPos,
1212 CallWideningDecisions[std::make_pair(CI, VF)] = {Kind, Variant, IID,
1219return CallWideningDecisions.
at(std::make_pair(CI, VF));
1222 /// Return True if instruction \p I is an optimizable truncate whose operand 1223 /// is an induction variable. Such a truncate will be removed by adding a new 1224 /// induction variable with the destination type. 1226// If the instruction is not a truncate, return false. 1227auto *Trunc = dyn_cast<TruncInst>(
I);
1231// Get the source and destination types of the truncate. 1235// If the truncate is free for the given types, return false. Replacing a 1236// free truncate with an induction variable would add an induction variable 1237// update instruction to each iteration of the loop. We exclude from this 1238// check the primary induction variable since it will need an update 1239// instruction regardless. 1240Value *
Op = Trunc->getOperand(0);
1244// If the truncated value is not an induction variable, return false. 1248 /// Collects the instructions to scalarize for each predicated instruction in 1252 /// Collect Uniform and Scalar values for the given \p VF. 1253 /// The sets depend on CM decision for Load/Store instructions 1254 /// that may be vectorized as interleave, gather-scatter or scalarized. 1255 /// Also make a decision on what to do about call instructions in the loop 1256 /// at that VF -- scalarize, call a known vector routine, or call a 1257 /// vector intrinsic. 1259// Do the analysis once. 1260if (VF.
isScalar() || Uniforms.contains(VF))
1263 collectLoopUniforms(VF);
1265 collectLoopScalars(VF);
1268 /// Returns true if the target machine supports masked store operation 1269 /// for the given \p DataType and kind of access to \p Ptr. 1275 /// Returns true if the target machine supports masked load operation 1276 /// for the given \p DataType and kind of access to \p Ptr. 1282 /// Returns true if the target machine can represent \p V as a masked gather 1283 /// or scatter operation. 1285boolLI = isa<LoadInst>(V);
1286bool SI = isa<StoreInst>(V);
1297 /// Returns true if the target machine supports all of the reduction 1298 /// variables found for the given VF. 1301 const RecurrenceDescriptor &RdxDesc = Reduction.second;
1302 return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1306 /// Given costs for both strategies, return true if the scalar predication 1307 /// lowering should be used for div/rem. This incorporates an override 1308 /// option so it is not simply a cost comparison. 1313return ScalarCost < SafeDivisorCost;
1322 /// Returns true if \p I is an instruction which requires predication and 1323 /// for which our chosen predication strategy is scalarization (i.e. we 1324 /// don't have an alternate strategy such as masking available). 1325 /// \p VF is the vectorization factor that will be used to vectorize \p I. 1328 /// Returns true if \p I is an instruction that needs to be predicated 1329 /// at runtime. The result is independent of the predication mechanism. 1330 /// Superset of instructions that return true for isScalarWithPredication. 1333 /// Return the costs for our two available strategies for lowering a 1334 /// div/rem operation which requires speculating at least one lane. 1335 /// First result is for scalarization (will be invalid for scalable 1336 /// vectors); second is for the safe-divisor strategy. 1337 std::pair<InstructionCost, InstructionCost>
1341 /// Returns true if \p I is a memory instruction with consecutive memory 1342 /// access that can be widened. 1345 /// Returns true if \p I is a memory instruction in an interleaved-group 1346 /// of memory accesses that can be vectorized with wide vector loads/stores 1350 /// Check if \p Instr belongs to any interleaved access group. 1355 /// Get the interleaved access group that \p Instr belongs to. 1361 /// Returns true if we're required to use a scalar epilogue for at least 1362 /// the final iteration of the original loop. 1365LLVM_DEBUG(
dbgs() <<
"LV: Loop does not require scalar epilogue\n");
1368// If we might exit from anywhere but the latch and early exit vectorization 1369// is disabled, we must run the exiting iteration in scalar form. 1372LLVM_DEBUG(
dbgs() <<
"LV: Loop requires scalar epilogue: not exiting " 1373"from latch block\n");
1378"interleaved group requires scalar epilogue\n");
1381LLVM_DEBUG(
dbgs() <<
"LV: Loop does not require scalar epilogue\n");
1385 /// Returns true if we're required to use a scalar epilogue for at least 1386 /// the final iteration of the original loop for all VFs in \p Range. 1387 /// A scalar epilogue must either be required for all VFs in \p Range or for 1393bool IsRequired =
all_of(
Range, RequiresScalarEpilogue);
1395 (IsRequired ||
none_of(
Range, RequiresScalarEpilogue)) &&
1396"all VFs in range must agree on whether a scalar epilogue is required");
1400 /// Returns true if a scalar epilogue is not allowed due to optsize or a 1401 /// loop hint annotation. 1406 /// Returns the TailFoldingStyle that is best for the current loop. 1408if (!ChosenTailFoldingStyle)
1410return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
1411 : ChosenTailFoldingStyle->second;
1414 /// Selects and saves TailFoldingStyle for 2 options - if IV update may 1415 /// overflow or not. 1416 /// \param IsScalableVF true if scalable vector factors enabled. 1417 /// \param UserIC User specific interleave count. 1419assert(!ChosenTailFoldingStyle &&
"Tail folding must not be selected yet.");
1421 ChosenTailFoldingStyle =
1427 ChosenTailFoldingStyle = std::make_pair(
1433// Set styles when forced. 1438// Override forced styles if needed. 1439// FIXME: use actual opcode/data type for analysis here. 1440// FIXME: Investigate opportunity for fixed vector factor. 1441// FIXME: support fixed-order recurrences by fixing splice of non VFxUF 1447// If for some reason EVL mode is unsupported, fallback to 1448// DataWithoutLaneMask to try to vectorize the loop with folded tail 1450 ChosenTailFoldingStyle =
1455 <<
"LV: Preference for VP intrinsics indicated. Will " 1456"not try to generate VP Intrinsics " 1458 ?
"since interleave count specified is greater than 1.\n" 1459 :
"due to non-interleaving reasons.\n"));
1463 /// Returns true if all loop blocks should be masked to fold tail loop. 1465// TODO: check if it is possible to check for None style independent of 1466// IVUpdateMayOverflow flag in getTailFoldingStyle. 1470 /// Return maximum safe number of elements to be processed per vector 1471 /// iteration, which do not prevent store-load forwarding and are safe with 1472 /// regard to the memory dependencies. Required for EVL-based VPlans to 1473 /// correctly calculate AVL (application vector length) as min(remaining AVL, 1474 /// MaxSafeElements). 1475 /// TODO: need to consider adjusting cost model to use this value as a 1476 /// vectorization factor for EVL-based vectorization. 1479 /// Returns true if the instructions in this block requires predication 1480 /// for any reason, e.g. because tail folding now requires a predicate 1481 /// or because the block in the original loop was predicated. 1486 /// Returns true if VP intrinsics with explicit vector length support should 1487 /// be generated in the tail folded loop. 1492 /// Returns true if the Phi is part of an inloop reduction. 1494return InLoopReductions.contains(Phi);
1497 /// Returns true if the predicated reduction select should be used to set the 1498 /// incoming value for the reduction phi. 1500// Force to use predicated reduction select since the EVL of the 1501// second-to-last iteration might not be VF*UF. 1509 /// Estimate cost of an intrinsic call instruction CI if it were vectorized 1510 /// with factor VF. Return the cost of the instruction, including 1511 /// scalarization overhead if it's needed. 1514 /// Estimate cost of a call instruction CI if it were vectorized with factor 1515 /// VF. Return the cost of the instruction, including scalarization overhead 1519 /// Invalidates decisions already taken by the cost model. 1521 WideningDecisions.
clear();
1522 CallWideningDecisions.
clear();
1527 /// Returns the expected execution cost. The unit of the cost does 1528 /// not matter because we use the 'cost' units to compare different 1529 /// vector widths. The cost that is returned is *not* normalized by 1530 /// the factor width. 1535 /// Returns true if epilogue vectorization is considered profitable, and 1536 /// false otherwise. 1537 /// \p VF is the vectorization factor chosen for the original loop. 1538 /// \p Multiplier is an aditional scaling factor applied to VF before 1539 /// comparing to EpilogueVectorizationMinVF. 1541constunsigned IC)
const;
1543 /// Returns the execution time cost of an instruction for a given vector 1544 /// width. Vector width of one means scalar. 1547 /// Return the cost of instructions in an inloop reduction pattern, if I is 1548 /// part of that pattern. 1551Type *VectorTy)
const;
1553 /// Returns true if \p Op should be considered invariant and if it is 1554 /// trivially hoistable. 1558unsigned NumPredStores = 0;
1560 /// \return An upper bound for the vectorization factors for both 1561 /// fixed and scalable vectorization, where the minimum-known number of 1562 /// elements is a power-of-2 larger than zero. If scalable vectorization is 1563 /// disabled or unsupported, then the scalable part will be equal to 1564 /// ElementCount::getScalable(0). 1567bool FoldTailByMasking);
1569 /// \return the maximized element count based on the targets vector 1570 /// registers and the loop trip-count, but limited to a maximum safe VF. 1571 /// This is a helper function of computeFeasibleMaxVF. 1572ElementCount getMaximizedVFForTarget(
unsigned MaxTripCount,
1573unsigned SmallestType,
1576bool FoldTailByMasking);
1578 /// Checks if scalable vectorization is supported and enabled. Caches the 1579 /// result to avoid repeated debug dumps for repeated queries. 1580bool isScalableVectorizationAllowed();
1582 /// \return the maximum legal scalable VF, based on the safe max number 1584ElementCount getMaxLegalScalableVF(
unsigned MaxSafeElements);
1586 /// Calculate vectorization cost of memory instruction \p I. 1589 /// The cost computation for scalarized memory instruction. 1592 /// The cost computation for interleaving group of memory instructions. 1595 /// The cost computation for Gather/Scatter instruction. 1598 /// The cost computation for widening instruction \p I with consecutive 1602 /// The cost calculation for Load/Store instruction \p I with uniform pointer - 1603 /// Load: scalar load + broadcast. 1604 /// Store: scalar store + (loop invariant value stored? 0 : extract of last 1608 /// Estimate the overhead of scalarizing an instruction. This is a 1609 /// convenience wrapper for the type-based getScalarizationOverhead API. 1613 /// Returns true if an artificially high cost for emulated masked memrefs 1617 /// Map of scalar integer values to the smallest bitwidth they can be legally 1618 /// represented as. The vector equivalents of these values should be truncated 1622 /// A type representing the costs for instructions if they were to be 1623 /// scalarized rather than vectorized. The entries are Instruction-Cost 1627 /// A set containing all BasicBlocks that are known to present after 1628 /// vectorization as a predicated block. 1630 PredicatedBBsAfterVectorization;
1632 /// Records whether it is allowed to have the original scalar loop execute at 1633 /// least once. This may be needed as a fallback loop in case runtime 1634 /// aliasing/dependence checks fail, or to handle the tail/remainder 1635 /// iterations when the trip count is unknown or doesn't divide by the VF, 1636 /// or as a peel-loop to handle gaps in interleave-groups. 1637 /// Under optsize and when the trip count is very small we don't allow any 1638 /// iterations to execute in the scalar loop. 1641 /// Control finally chosen tail folding style. The first element is used if 1642 /// the IV update may overflow, the second element - if it does not. 1643 std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
1644 ChosenTailFoldingStyle;
1646 /// true if scalable vectorization is supported and enabled. 1647 std::optional<bool> IsScalableVectorizationAllowed;
1649 /// Maximum safe number of elements to be processed per vector iteration, 1650 /// which do not prevent store-load forwarding and are safe with regard to the 1651 /// memory dependencies. Required for EVL-based veectorization, where this 1652 /// value is used as the upper bound of the safe AVL. 1653 std::optional<unsigned> MaxSafeElements;
1655 /// A map holding scalar costs for different vectorization factors. The 1656 /// presence of a cost for an instruction in the mapping indicates that the 1657 /// instruction will be scalarized when vectorizing with the associated 1658 /// vectorization factor. The entries are VF-ScalarCostTy pairs. 1661 /// Holds the instructions known to be uniform after vectorization. 1662 /// The data is collected per VF. 1665 /// Holds the instructions known to be scalar after vectorization. 1666 /// The data is collected per VF. 1669 /// Holds the instructions (address computations) that are forced to be 1673 /// PHINodes of the reductions that should be expanded in-loop. 1676 /// A Map of inloop reduction operations and their immediate chain operand. 1677 /// FIXME: This can be removed once reductions can be costed correctly in 1678 /// VPlan. This was added to allow quick lookup of the inloop operations. 1681 /// Returns the expected difference in cost from scalarizing the expression 1682 /// feeding a predicated instruction \p PredInst. The instructions to 1683 /// scalarize and their scalar costs are collected in \p ScalarCosts. A 1684 /// non-negative return value implies the expression will be scalarized. 1685 /// Currently, only single-use chains are considered for scalarization. 1687 ScalarCostsTy &ScalarCosts,
1690 /// Collect the instructions that are uniform after vectorization. An 1691 /// instruction is uniform if we represent it with a single scalar value in 1692 /// the vectorized loop corresponding to each vector iteration. Examples of 1693 /// uniform instructions include pointer operands of consecutive or 1694 /// interleaved memory accesses. Note that although uniformity implies an 1695 /// instruction will be scalar, the reverse is not true. In general, a 1696 /// scalarized instruction will be represented by VF scalar values in the 1697 /// vectorized loop, each corresponding to an iteration of the original 1701 /// Collect the instructions that are scalar after vectorization. An 1702 /// instruction is scalar if it is known to be uniform or will be scalarized 1703 /// during vectorization. collectLoopScalars should only add non-uniform nodes 1704 /// to the list if they are used by a load/store instruction that is marked as 1705 /// CM_Scalarize. Non-uniform scalarized instructions will be represented by 1706 /// VF values in the vectorized loop, each corresponding to an iteration of 1707 /// the original scalar loop. 1710 /// Keeps cost model vectorization decision and cost for instructions. 1711 /// Right now it is used for memory instructions only. 1713 std::pair<InstWidening, InstructionCost>>;
1715 DecisionList WideningDecisions;
1717usingCallDecisionList =
1720 CallDecisionList CallWideningDecisions;
1722 /// Returns true if \p V is expected to be vectorized and it needs to be 1731// Assume we can vectorize V (and hence we need extraction) if the 1732// scalars are not computed yet. This can happen, because it is called 1733// via getScalarizationOverhead from setCostBasedWideningDecision, before 1734// the scalars are collected. That should be a safe assumption in most 1735// cases, because we check if the operands have vectorizable types 1736// beforehand in LoopVectorizationLegality. 1740 /// Returns a range containing only operands needing to be extracted. 1744 Ops, [
this, VF](
Value *V) {
return this->needsExtract(V, VF); }));
1748 /// The loop that we evaluate. 1751 /// Predicated scalar evolution analysis. 1754 /// Loop Info analysis. 1757 /// Vectorization legality. 1760 /// Vector target information. 1763 /// Target Library Info. 1766 /// Demanded bits analysis. 1769 /// Assumption cache. 1772 /// Interface to emit optimization remarks. 1777 /// Loop Vectorize Hint. 1780 /// The interleave access information contains groups of interleaved accesses 1781 /// with the same stride and close to each other. 1784 /// Values to ignore in the cost model. 1787 /// Values to ignore in the cost model when VF > 1. 1790 /// All element types found in the loop. 1793 /// The kind of cost that we are calculating 1796}
// end namespace llvm 1799/// Helper struct to manage generating runtime checks for vectorization. 1801/// The runtime checks are created up-front in temporary blocks to allow better 1802/// estimating the cost and un-linked from the existing IR. After deciding to 1803/// vectorize, the checks are moved back. If deciding not to vectorize, the 1804/// temporary blocks are completely removed. 1805classGeneratedRTChecks {
1806 /// Basic block which contains the generated SCEV checks, if any. 1809 /// The value representing the result of the generated SCEV checks. If it is 1810 /// nullptr, either no SCEV checks have been generated or they have been used. 1811Value *SCEVCheckCond =
nullptr;
1813 /// Basic block which contains the generated memory runtime checks, if any. 1816 /// The value representing the result of the generated memory runtime checks. 1817 /// If it is nullptr, either no memory runtime checks have been generated or 1818 /// they have been used. 1819Value *MemRuntimeCheckCond =
nullptr;
1828bool CostTooHigh =
false;
1829constbool AddBranchWeights;
1831Loop *OuterLoop =
nullptr;
1835 /// The kind of cost that we are calculating 1843 : DT(DT), LI(LI),
TTI(
TTI), SCEVExp(*PSE.
getSE(),
DL,
"scev.check"),
1844 MemCheckExp(*PSE.
getSE(),
DL,
"scev.check"),
1845 AddBranchWeights(AddBranchWeights), PSE(PSE), CostKind(CostKind) {}
1847 /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can 1848 /// accurately estimate the cost of the runtime checks. The blocks are 1849 /// un-linked from the IR and are added back during vector code generation. If 1850 /// there is no vector code generation, the check blocks are removed 1855// Hard cutoff to limit compile-time increase in case a very large number of 1856// runtime checks needs to be generated. 1857// TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to 1867// Use SplitBlock to create blocks for SCEV & memory runtime checks to 1868// ensure the blocks are properly added to LoopInfo & DominatorTree. Those 1869// may be used by SCEVExpander. The blocks will be un-linked from their 1870// predecessors and removed from LI & DT at the end of the function. 1873nullptr,
"vector.scevcheck");
1880if (RtPtrChecking.Need) {
1881auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1882 MemCheckBlock =
SplitBlock(Pred, Pred->getTerminator(), DT, LI,
nullptr,
1885auto DiffChecks = RtPtrChecking.getDiffChecks();
1887Value *RuntimeVF =
nullptr;
1892 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1898 MemCheckBlock->
getTerminator(), L, RtPtrChecking.getChecks(),
1901assert(MemRuntimeCheckCond &&
1902"no RT checks generated although RtPtrChecking " 1903"claimed checks are required");
1906if (!MemCheckBlock && !SCEVCheckBlock)
1909// Unhook the temporary block with the checks, update various places 1916if (SCEVCheckBlock) {
1934if (SCEVCheckBlock) {
1939// Outer loop is used as part of the later cost calculations. 1940 OuterLoop =
L->getParentLoop();
1944if (SCEVCheckBlock || MemCheckBlock)
1957if (SCEVCheckBlock->getTerminator() == &
I)
1966if (MemCheckBlock->getTerminator() == &
I)
1973// If the runtime memory checks are being created inside an outer loop 1974// we should find out if these checks are outer loop invariant. If so, 1975// the checks will likely be hoisted out and so the effective cost will 1976// reduce according to the outer loop trip count. 1979// TODO: If profitable, we could refine this further by analysing every 1980// individual memory check, since there could be a mixture of loop 1981// variant and invariant checks that mean the final condition is 1985// It seems reasonable to assume that we can reduce the effective 1986// cost of the checks even when we know nothing about the trip 1987// count. Assume that the outer loop executes at least twice. 1988unsigned BestTripCount = 2;
1990// Get the best known TC estimate. 1992 PSE, OuterLoop,
/* CanUseConstantMax = */false))
1993 BestTripCount = *EstimatedTC;
1995 BestTripCount = std::max(BestTripCount, 1U);
1998// Let's ensure the cost is always at least 1. 1999 NewMemCheckCost = std::max(*NewMemCheckCost.
getValue(),
2002if (BestTripCount > 1)
2004 <<
"We expect runtime memory checks to be hoisted " 2005 <<
"out of the outer loop. Cost reduced from " 2006 << MemCheckCost <<
" to " << NewMemCheckCost <<
'\n');
2008 MemCheckCost = NewMemCheckCost;
2012 RTCheckCost += MemCheckCost;
2015if (SCEVCheckBlock || MemCheckBlock)
2016LLVM_DEBUG(
dbgs() <<
"Total cost of runtime checks: " << RTCheckCost
2022 /// Remove the created SCEV & memory runtime check blocks & instructions, if 2024 ~GeneratedRTChecks() {
2028 SCEVCleaner.markResultUsed();
2030if (!MemRuntimeCheckCond)
2031 MemCheckCleaner.markResultUsed();
2033if (MemRuntimeCheckCond) {
2034auto &SE = *MemCheckExp.
getSE();
2035// Memory runtime check generation creates compares that use expanded 2036// values. Remove them before running the SCEVExpanderCleaners. 2044 MemCheckCleaner.cleanup();
2045 SCEVCleaner.cleanup();
2048 SCEVCheckBlock->eraseFromParent();
2049if (MemRuntimeCheckCond)
2050 MemCheckBlock->eraseFromParent();
2053 /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and 2054 /// adjusts the branches to branch to the vector preheader or \p Bypass, 2055 /// depending on the generated condition. 2062// Mark the check as used, to prevent it from being removed during cleanup. 2063 SCEVCheckCond =
nullptr;
2064if (
auto *
C = dyn_cast<ConstantInt>(
Cond))
2071// Create new preheader for vector loop. 2075 SCEVCheckBlock->getTerminator()->eraseFromParent();
2076 SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2077 Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2084if (AddBranchWeights)
2087return SCEVCheckBlock;
2090 /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts 2091 /// the branches to branch to the vector preheader or \p Bypass, depending on 2092 /// the generated condition. 2095// Check if we generated code that checks in runtime if arrays overlap. 2096if (!MemRuntimeCheckCond)
2105 MemCheckBlock->moveBefore(LoopVectorPreHeader);
2112if (AddBranchWeights) {
2116 MemCheckBlock->getTerminator()->setDebugLoc(
2117 Pred->getTerminator()->getDebugLoc());
2119// Mark the check as used, to prevent it from being removed during cleanup. 2120 MemRuntimeCheckCond =
nullptr;
2121return MemCheckBlock;
2127return Style == TailFoldingStyle::Data ||
2128 Style == TailFoldingStyle::DataAndControlFlow ||
2129 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2133return Style == TailFoldingStyle::DataAndControlFlow ||
2134 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2137// Return true if \p OuterLp is an outer loop annotated with hints for explicit 2138// vectorization. The loop needs to be annotated with #pragma omp simd 2139// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the 2140// vector length information is not provided, vectorization is not considered 2141// explicit. Interleave hints are not allowed either. These limitations will be 2142// relaxed in the future. 2143// Please, note that we are currently forced to abuse the pragma 'clang 2144// vectorize' semantics. This pragma provides *auto-vectorization hints* 2145// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd' 2146// provides *explicit vectorization hints* (LV can bypass legal checks and 2147// assume that vectorization is legal). However, both hints are implemented 2148// using the same metadata (llvm.loop.vectorize, processed by 2149// LoopVectorizeHints). This will be fixed in the future when the native IR 2150// representation for pragma 'omp simd' is introduced. 2156// Only outer loops with an explicit vectorization hint are supported. 2157// Unannotated outer loops are ignored. 2163true/*VectorizeOnlyWhenForced*/)) {
2164LLVM_DEBUG(
dbgs() <<
"LV: Loop hints prevent outer loop vectorization.\n");
2169// TODO: Interleave support is future work. 2170LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Interleave is not supported for " 2182// Collect inner loops and outer loops without irreducible control flow. For 2183// now, only collect outer loops that have explicit vectorization hints. If we 2184// are stress testing the VPlan H-CFG construction, we collect the outermost 2185// loop of every loop nest. 2190if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2192// TODO: Collect inner loops inside marked outer loops in case 2193// vectorization fails for the outer loop. Do not invoke 2194// 'containsIrreducibleCFG' again for inner loops when the outer loop is 2195// already known to be reducible. We can use an inherited attribute for 2200for (
Loop *InnerL : L)
2204//===----------------------------------------------------------------------===// 2205// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and 2206// LoopVectorizationCostModel and LoopVectorizationPlanner. 2207//===----------------------------------------------------------------------===// 2209/// Compute the transformed value of Index at offset StartValue using step 2211/// For integer induction, returns StartValue + Index * StepValue. 2212/// For pointer induction, returns StartValue[Index * StepValue]. 2213/// FIXME: The newly created binary instructions should contain nsw/nuw 2214/// flags, which can be found from the original scalar operations. 2222 ?
B.CreateSExtOrTrunc(Index, StepTy)
2223 :
B.CreateCast(Instruction::SIToFP, Index, StepTy);
2224if (CastedIndex != Index) {
2226 Index = CastedIndex;
2229// Note: the IR at this point is broken. We cannot use SE to create any new 2230// SCEV and then expand it, hoping that SCEV's simplification will give us 2231// a more optimal code. Unfortunately, attempt of doing so on invalid IR may 2232// lead to various SCEV crashes. So all we can do is to use builder and rely 2233// on InstCombine for future simplifications. Here we handle some trivial 2236assert(
X->getType() ==
Y->getType() &&
"Types don't match!");
2237if (
auto *CX = dyn_cast<ConstantInt>(
X))
2240if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2243returnB.CreateAdd(
X,
Y);
2246// We allow X to be a vector type, in which case Y will potentially be 2247// splatted into a vector with the same element count. 2249assert(
X->getType()->getScalarType() ==
Y->getType() &&
2250"Types don't match!");
2251if (
auto *CX = dyn_cast<ConstantInt>(
X))
2254if (
auto *CY = dyn_cast<ConstantInt>(
Y))
2257VectorType *XVTy = dyn_cast<VectorType>(
X->getType());
2258if (XVTy && !isa<VectorType>(
Y->getType()))
2259Y =
B.CreateVectorSplat(XVTy->getElementCount(),
Y);
2260returnB.CreateMul(
X,
Y);
2263switch (InductionKind) {
2265assert(!isa<VectorType>(Index->getType()) &&
2266"Vector indices not supported for integer inductions yet");
2268"Index type does not match StartValue type");
2269if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2270returnB.CreateSub(StartValue, Index);
2275returnB.CreatePtrAdd(StartValue,
CreateMul(Index, Step));
2277assert(!isa<VectorType>(Index->getType()) &&
2278"Vector indices not supported for FP inductions yet");
2281 (InductionBinOp->
getOpcode() == Instruction::FAdd ||
2282 InductionBinOp->
getOpcode() == Instruction::FSub) &&
2283"Original bin op should be defined for FP induction");
2285Value *MulExp =
B.CreateFMul(Step, Index);
2286returnB.CreateBinOp(InductionBinOp->
getOpcode(), StartValue, MulExp,
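// Worked example (illustrative) for the transform above: an integer induction
// with StartValue 4 and Step 3 maps index i to 4 + 3 * i; a pointer induction
// maps it to &StartValue[3 * i]; an FP induction built from FAdd maps it to
// StartValue + 3.0 * i.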
2300if (
F.hasFnAttribute(Attribute::VScaleRange))
2301returnF.getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
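// For example (illustrative): a function carrying the attribute
// vscale_range(1,16) reports a maximum vscale of 16 here, so a scalable VF of
// <vscale x 4 x i32> is tuned as if up to 16 * 4 = 64 lanes could execute per
// wide iteration.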
/// For the given VF and UF and maximum trip count computed for the loop, return
/// whether the induction variable might overflow in the vectorized loop. If not,
/// then we know a runtime overflow check always evaluates to false and can be
/// removed.
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  Type *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = cast<IntegerType>(IdxTy)->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      std::optional<unsigned> MaxVScale =
          getMaxVScale(*Cost->TheFunction, Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }
    return (MaxUIntTripCount - TC).ugt(MaxVF * MaxUF);
  }
  return false;
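// A minimal sketch of the overflow reasoning above in plain unsigned
// arithmetic, assuming a fixed (non-scalable) VF and an induction type of
// IdxBits bits; names are hypothetical. The runtime check folds to false
// exactly when the headroom above TC is larger than one vector step.
#include <cstdint>
#include <limits>

static bool indvarOverflowCheckKnownFalse(uint64_t TC, uint64_t VF, uint64_t UF,
                                          unsigned IdxBits) {
  const uint64_t MaxUIntTripCount =
      IdxBits == 64 ? std::numeric_limits<uint64_t>::max()
                    : (uint64_t(1) << IdxBits) - 1;
  // TC + VF * UF cannot wrap iff MaxUIntTripCount - TC > VF * UF.
  return (MaxUIntTripCount - TC) > VF * UF;
}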
// Return whether we allow using masked interleave-groups (for dealing with
// strided loads/stores that reside in predicated blocks, or for dealing
// with gaps).
  // If an override option has been passed in for interleaved accesses, use it.

  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
  // Does this instruction return a value?
  bool IsVoidRetTy = Instr->getType()->isVoidTy();
    Cloned->setName(Instr->getName() + ".cloned");
2362// Verify that VPlan type inference results agree with the type of the 2365"inferred type and type from generated instructions do not match");
2371if (
autoDL = Instr->getDebugLoc())
2374// Replace the operands of the cloned instructions with their scalar 2375// equivalents in the new loop. 2377auto InputLane = Lane;
2385// Place the cloned scalar in the new loop. 2388 State.
set(RepRecipe, Cloned, Lane);
2390// If we just cloned a new assumption, add it the assumption cache. 2391if (
auto *
II = dyn_cast<AssumeInst>(Cloned))
2396bool IfPredicateInstr = Parent ? Parent->
isReplicator() :
false;
2400 [](
VPValue *
Op) { return Op->isDefinedOutsideLoopRegions(); })) &&
2401"Expected a recipe is either within a region or all of its operands " 2402"are defined outside the vectorized region.");
2403if (IfPredicateInstr)
  // This is where we can make the step a runtime constant.

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
         "Unexpected successor");
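// A plain-C++ sketch of the vector trip-count computation described in the
// comments above. It assumes Step = VF * UF, and that tail folding and a
// required scalar epilogue do not occur together; all names are hypothetical.
#include <cassert>
#include <cstdint>

static uint64_t vectorTripCountSketch(uint64_t N, uint64_t Step, bool FoldTail,
                                      bool RequiresScalarEpilogue) {
  // When folding the tail, round N up to a multiple of Step by adding Step-1
  // and then rounding down (Step is a power of two in that case).
  if (FoldTail) {
    assert((Step & (Step - 1)) == 0 && "Step must be a power of 2");
    N += Step - 1;
  }
  uint64_t VecTripCount = N - (N % Step);
  // If a scalar epilogue must run at least once and Step evenly divides N,
  // hand one full vector step back to the scalar remainder loop.
  if (RequiresScalarEpilogue && VecTripCount == N)
    VecTripCount -= Step;
  return VecTripCount;
}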
2466 PreVectorPH = CheckVPIRBB;
2474// Reuse existing vector loop preheader for TC checks. 2475// Note that new preheader block is generated for vector loop. 2479// Generate code to check if the loop's trip count is less than VF * UF, or 2480// equal to it in case a scalar epilogue is required; this implies that the 2481// vector trip count is zero. This check also covers the case where adding one 2482// to the backedge-taken count overflowed leading to an incorrect trip count 2483// of zero. In this case we will also jump to the scalar loop. 2487// If tail is to be folded, vector loop takes care of all iterations. 2490auto CreateStep = [&]() ->
Value * {
2491// Create step with max(MinProTripCount, UF * VF). 2505Value *Step = CreateStep();
2507// TODO: Emit unconditional branch to vector preheader instead of 2508// conditional branch with known condition. 2510// Check if the trip count is < the step. 2512// TODO: Ensure step is at most the trip count when determining max VF and 2513// UF, w/o tail folding. 2516 TripCountSCEV, SE.
getSCEV(Step))) {
2517// Generate the minimum iteration check only if we cannot prove the 2518// check is known to be true, or known to be false. 2520 }
// else step known to be < trip count, use CheckMinIters preset to false. 2524// vscale is not necessarily a power-of-2, which means we cannot guarantee 2525// an overflow to zero when updating induction variables and so an 2526// additional overflow check is required before entering the vector loop. 2528// Get the maximum unsigned value for the type. 2529Value *MaxUIntTripCount =
2530 ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2533// Don't execute the vector loop if (UMax - n) < (VF * UF). 2537// Create new preheader for vector loop. 2544"TC check is expected to dominate Bypass");
2553// TODO: Wrap LoopVectorPreHeader in VPIRBasicBlock here. 2566"Cannot SCEV check stride or overflow when optimizing for size");
2568"Should already be a bypass block due to iteration count check");
2573return SCEVCheckBlock;
2577// VPlan-native path does not do any analysis for runtime checks currently. 2584// Check if we generated code that checks in runtime if arrays overlap. We put 2585// the checks into a separate block to make the more common case of few 2592"Cannot emit memory checks when optimizing for size, unless forced " 2598 <<
"Code-size may be reduced by not forcing " 2599"vectorization, or by source-code modifications " 2600"eliminating the need for runtime checks " 2601"(e.g., adding 'restrict').";
2610return MemCheckBlock;
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
    assert(!R.isPhi() && "Tried to move phi recipe to end of block");
    R.moveBefore(*IRVPBB, IRVPBB->end());

  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
         "loops not exiting via the latch without required epilogue?");
2637LI,
nullptr,
Twine(Prefix) +
"middle.block");
2641nullptr,
Twine(Prefix) +
"scalar.ph");
/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
/// expansion results.
static Value *getExpandedStep(const InductionDescriptor &ID,
                              const SCEV2ValueTy &ExpandedSCEVs) {
  const SCEV *Step = ID.getStep();
  if (auto *C = dyn_cast<SCEVConstant>(Step))
    return C->getValue();
  if (auto *U = dyn_cast<SCEVUnknown>(Step))
    return U->getValue();
  auto I = ExpandedSCEVs.find(Step);
  assert(I != ExpandedSCEVs.end() && "SCEV must be expanded at this point");
  return I->second;
}
/// Knowing that loop \p L executes a single vector iteration, add instructions
/// that will get simplified and thus should not have any cost to \p
/// InstsToIgnore.
  auto *Cmp = L->getLatchCmpInst();
  if (Cmp)
    InstsToIgnore.insert(Cmp);
  for (const auto &KV : IL) {
    // Extract the key by hand so that it can be used in the lambda below. Note
    // that captured structured bindings are a C++20 extension.
    const PHINode *IV = KV.first;

    // Get next iteration value of the induction variable.
    Instruction *IVInst =
        cast<Instruction>(IV->getIncomingValueForBlock(L->getLoopLatch()));
    if (all_of(IVInst->users(),
               [&](const User *U) { return U == IV || U == Cmp; }))
      InstsToIgnore.insert(IVInst);
  }
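// Hedged sketch of the reasoning above: when the vector loop is known to run
// exactly one iteration, the latch compare and an induction update that only
// feeds that compare fold away, so they should contribute no cost. The toy
// cost model and all names below are hypothetical.
#include <cstdint>

static uint64_t singleIterationLoopCost(uint64_t BodyCost, uint64_t IVUpdateCost,
                                        uint64_t LatchCmpCost,
                                        bool RunsExactlyOnce) {
  if (RunsExactlyOnce)
    return BodyCost; // IV update and latch test simplify away
  return BodyCost + IVUpdateCost + LatchCmpCost;
}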
2683const SCEV2ValueTy &ExpandedSCEVs,
Value *MainVectorTripCount) {
2684assert(MainVectorTripCount &&
"Must have bypass information");
    PHINode *OrigPhi = InductionEntry.first;
    const InductionDescriptor &II = InductionEntry.second;
    Value *Step = getExpandedStep(II, ExpandedSCEVs);

    // For the primary induction the additional bypass end value is known.
    // Otherwise it is computed.
    Value *EndValueFromAdditionalBypass = MainVectorTripCount;
    if (OrigPhi != OldInduction) {
      auto *BinOp = II.getInductionBinOp();
      // Fast-math-flags propagate from the original induction instruction.
      if (isa_and_nonnull<FPMathOperator>(BinOp))
        B.setFastMathFlags(BinOp->getFastMathFlags());

      // Compute the end value for the additional bypass.
      EndValueFromAdditionalBypass =
          emitTransformedIndex(B, MainVectorTripCount, II.getStartValue(), Step,
                               II.getKind(), BinOp);
      EndValueFromAdditionalBypass->setName("ind.end");
    }

    // Store the bypass value here, as it needs to be added as operand to its
    // scalar preheader phi node after the epilogue skeleton has been created.
    // TODO: Directly add as extra operand to the VPResumePHI recipe.
           "entry for OrigPhi already exists");
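// Hedged sketch of the "ind.end" computation above for a simple integer
// induction: after the main vector loop has executed MainVectorTripCount
// iterations, the induction resumes at Start + MainVectorTripCount * Step.
// The function name is hypothetical; the real code emits this via
// emitTransformedIndex for every induction kind.
#include <cstdint>

static int64_t additionalBypassEndValue(int64_t Start, int64_t Step,
                                        int64_t MainVectorTripCount) {
  return Start + MainVectorTripCount * Step;
}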
2719const SCEV2ValueTy &ExpandedSCEVs) {
2721 In this function we generate a new loop. The new loop will contain 2722 the vectorized instructions while the old loop will continue to run the 2725 [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's 2726 / | preheader are expanded here. Eventually all required SCEV 2727 / | expansion should happen here. 2729 | [ ] <-- vector loop bypass (may consist of multiple blocks). 2732 || [ ] <-- vector pre header. 2736 | [ ]_| <-- vector loop (created during VPlan execution). 2739 \ -[ ] <--- middle-block (wrapped in VPIRBasicBlock with the branch to 2740 | | successors created during VPlan execution) 2743 | ->[ ] <--- new preheader (wrapped in VPIRBasicBlock). 2745 (opt) v <-- edge from middle to exit iff epilogue is not required. 2747 | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue, header 2748 | | wrapped in VPIRBasicBlock). 2751 >[ ] <-- exit block(s). (wrapped in VPIRBasicBlock) 2755// Create an empty vector loop, and prepare basic blocks for the runtime 2759// Now, compare the new count to zero. If it is zero skip the vector loop and 2760// jump to the scalar loop. This check also covers the case where the 2761// backedge-taken count is uint##_max: adding one to it will overflow leading 2762// to an incorrect trip count of zero. In this (rare) case we will also jump 2763// to the scalar loop. 2766// Generate the code to check any assumptions that we've made for SCEV 2770// Generate the code that checks in runtime if arrays overlap. We put the 2771// checks into a separate block to make the more common case of few elements 2780structCSEDenseMapInfo {
    return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
           isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
  }

  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(I->getOpcode(),
                        hash_combine_range(I->value_op_begin(),
                                           I->value_op_end()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(RHS);
  }
};

} // end anonymous namespace
/// Perform CSE of induction variable instructions.
  // Perform simple cse.
    if (!CSEDenseMapInfo::canHandle(&In))
      continue;

    // Check if we can replace this instruction with any of the
    // visited instructions.
      In.replaceAllUsesWith(V);
      In.eraseFromParent();
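// A minimal sketch of the simple CSE performed above, over a toy instruction
// representation keyed by its textual form. The real pass keys a DenseMap with
// CSEDenseMapInfo and compares instructions with isIdenticalTo(); every name
// in this example is hypothetical.
#include <string>
#include <unordered_map>
#include <vector>

struct ToyInst {
  std::string Key;             // stands in for opcode + operands
  ToyInst *ReplacedBy = nullptr;
};

static void toyCSE(std::vector<ToyInst *> &Block) {
  std::unordered_map<std::string, ToyInst *> Seen;
  for (ToyInst *In : Block) {
    auto [It, Inserted] = Seen.try_emplace(In->Key, In);
    // A previously visited identical instruction exists: reuse it instead.
    if (!Inserted)
      In->ReplacedBy = It->second;
  }
}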
2833// We only need to calculate a cost if the VF is scalar; for actual vectors 2834// we should already have a pre-calculated cost at each VF. 2836return CallWideningDecisions.at(std::make_pair(CI, VF)).Cost;
2844for (
auto &ArgOp : CI->
args())
2845 Tys.push_back(ArgOp->getType());
2850// If this is an intrinsic we may have a lower cost for it. 2853return std::min(ScalarCallCost, IntrinsicCost);
2855return ScalarCallCost;
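// Hedged sketch of the shape of the decision above: the scalarized call cost
// is VF copies of the scalar call plus insert/extract overhead, and an
// intrinsic lowering is preferred when it is cheaper. The inputs and function
// name are hypothetical simplifications of the real cost model.
#include <algorithm>
#include <optional>

static unsigned vectorCallCostSketch(unsigned VF, unsigned ScalarCallCost,
                                     unsigned ScalarizationOverhead,
                                     std::optional<unsigned> IntrinsicCost) {
  unsigned ScalarizedCost = ScalarCallCost * VF + ScalarizationOverhead;
  if (IntrinsicCost)
    return std::min(ScalarizedCost, *IntrinsicCost);
  return ScalarizedCost;
}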
2868assert(
ID &&
"Expected intrinsic call!");
2871if (
auto *FPMO = dyn_cast<FPMathOperator>(CI))
2872 FMF = FPMO->getFastMathFlags();
2878 std::back_inserter(ParamTys),
2879 [&](
Type *Ty) { return maybeVectorizeType(Ty, VF); });
2882 dyn_cast<IntrinsicInst>(CI));
2887// Fix widened non-induction PHIs by setting up the PHI operands. 2891// Forget the original basic block. 2895// After vectorization, the exit blocks of the original loop will have 2896// additional predecessors. Invalidate SCEVs for the exit phis in case SE 2897// looked through single-entry phis. 2901for (
PHINode &PN : Exit->phis())
  // Don't apply optimizations below when no vector region remains, as they all
  // require a vector loop at the moment.

  // Remove redundant induction instructions.

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up getting a slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.

  // The basic block and loop containing the predicated instruction.

  // Initialize a worklist with the operands of the predicated instruction.

  // Holds instructions that we need to analyze again. An instruction may be
  // reanalyzed if we don't yet know if we can sink it or not.

  // Returns true if a given use occurs in the predicated block. Phi nodes use
  // their operands in their corresponding predecessor blocks.
  auto IsBlockOfUsePredicated = [&](
      Use &U) -> bool {
    auto *I = cast<Instruction>(U.getUser());
    BasicBlock *BB = I->getParent();
    if (auto *Phi = dyn_cast<PHINode>(I))
      BB = Phi->getIncomingBlock(
          PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
    return BB == PredBB;
  };

  // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends after one pass
  // through the worklist doesn't sink a single instruction.
  bool Changed;
  do {
    // Add the instructions that need to be reanalyzed to the worklist, and
    // reset the changed indicator.
    Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
    InstsToReanalyze.clear();
    Changed = false;

    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

      // We can't sink an instruction if it is a phi node, is not in the loop,
      // may have side effects or may read from memory.
      // TODO: Could do more granular checking to allow sinking
      // a load past non-store instructions.
      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
          I->mayHaveSideEffects() || I->mayReadFromMemory())
        continue;

      // If the instruction is already in PredBB, check if we can sink its
      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
      // sinking the scalar instruction I, hence it appears in PredBB; but it
      // may have failed to sink I's operands (recursively), which we try
      // again here.
      if (I->getParent() == PredBB) {
        Worklist.insert(I->op_begin(), I->op_end());
        continue;
      }

      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
      if (!all_of(I->uses(), IsBlockOfUsePredicated)) {
        InstsToReanalyze.push_back(I);
        continue;
      }

      // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
      I->moveBefore(PredBB->getFirstInsertionPt());
      Worklist.insert(I->op_begin(), I->op_end());
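// A compact sketch of the fixed-point sinking loop above, over a toy graph
// where each instruction knows its operands and users; all names are
// hypothetical. Sinking repeats until a full pass makes no change, and
// instructions whose users are not yet all predicated are retried later.
#include <algorithm>
#include <vector>

struct SinkInst {
  std::vector<SinkInst *> Operands;
  std::vector<SinkInst *> Users;
  bool InPredBlock = false;
  bool Sinkable = true; // e.g. not a phi, no side effects
};

static void sinkScalarOperandsSketch(SinkInst *Predicated) {
  std::vector<SinkInst *> Worklist(Predicated->Operands);
  bool Changed = true;
  while (Changed) {
    Changed = false;
    std::vector<SinkInst *> Next;
    for (SinkInst *I : Worklist) {
      if (!I->Sinkable || I->InPredBlock)
        continue;
      // Legal to sink only if every user already lives in the predicated
      // block; otherwise reanalyze the instruction on the next pass.
      bool AllUsesPredicated =
          std::all_of(I->Users.begin(), I->Users.end(),
                      [](const SinkInst *U) { return U->InPredBlock; });
      if (!AllUsesPredicated) {
        Next.push_back(I);
        continue;
      }
      I->InPredBlock = true; // "move" it into the predicated block
      Changed = true;
      // Its operands become new sinking candidates.
      for (SinkInst *Op : I->Operands)
        Next.push_back(Op);
    }
    Worklist = std::move(Next);
  }
}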
3006// The sinking may have enabled other instructions to be sunk, so we will 3015for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
3020PHINode *NewPhi = cast<PHINode>(State.
get(VPPhi));
3021// Make sure the builder has a valid insert point. 3032void LoopVectorizationCostModel::collectLoopScalars(
ElementCount VF) {
3033// We should not collect Scalars more than once per VF. Right now, this 3034// function is called from collectUniformsAndScalars(), which already does 3035// this check. Collecting Scalars for VF=1 does not make any sense. 3037"This function should not be visited twice for the same VF");
3039// This avoids any chances of creating a REPLICATE recipe during planning 3040// since that would result in generation of scalarized code during execution, 3041// which is not supported for scalable vectors. 3043 Scalars[VF].
insert(Uniforms[VF].begin(), Uniforms[VF].end());
3049// These sets are used to seed the analysis with pointers used by memory 3050// accesses that will remain scalar. 3055// A helper that returns true if the use of Ptr by MemAccess will be scalar. 3056// The pointer operands of loads and stores will be scalar as long as the 3057// memory access is not a gather or scatter operation. The value operand of a 3058// store will remain scalar if the store is scalarized. 3062"Widening decision should be ready at this moment");
3063if (
auto *Store = dyn_cast<StoreInst>(MemAccess))
3064if (
Ptr == Store->getValueOperand())
3067"Ptr is neither a value or pointer operand");
3071// A helper that returns true if the given value is a getelementptr 3072// instruction contained in the loop. 3073auto IsLoopVaryingGEP = [&](
Value *
V) {
3077// A helper that evaluates a memory access's use of a pointer. If the use will 3078// be a scalar use and the pointer is only used by memory accesses, we place 3079// the pointer in ScalarPtrs. Otherwise, the pointer is placed in 3080// PossibleNonScalarPtrs. 3082// We only care about bitcast and getelementptr instructions contained in 3084if (!IsLoopVaryingGEP(
Ptr))
3087// If the pointer has already been identified as scalar (e.g., if it was 3088// also identified as uniform), there's nothing to do. 3089auto *
I = cast<Instruction>(
Ptr);
3093// If the use of the pointer will be a scalar use, and all users of the 3094// pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise, 3095// place the pointer in PossibleNonScalarPtrs. 3096if (IsScalarUse(MemAccess,
Ptr) &&
3097all_of(
I->users(), IsaPred<LoadInst, StoreInst>))
3100 PossibleNonScalarPtrs.
insert(
I);
3103// We seed the scalars analysis with three classes of instructions: (1) 3104// instructions marked uniform-after-vectorization and (2) bitcast, 3105// getelementptr and (pointer) phi instructions used by memory accesses 3106// requiring a scalar use. 3108// (1) Add to the worklist all instructions that have been identified as 3109// uniform-after-vectorization. 3112// (2) Add to the worklist all bitcast and getelementptr instructions used by 3113// memory accesses requiring a scalar use. The pointer operands of loads and 3114// stores will be scalar unless the operation is a gather or scatter. 3115// The value operand of a store will remain scalar if the store is scalarized. 3117for (
auto &
I : *BB) {
3118if (
auto *Load = dyn_cast<LoadInst>(&
I)) {
3119 EvaluatePtrUse(Load,
Load->getPointerOperand());
3120 }
elseif (
auto *Store = dyn_cast<StoreInst>(&
I)) {
3121 EvaluatePtrUse(Store,
Store->getPointerOperand());
3122 EvaluatePtrUse(Store,
Store->getValueOperand());
3125for (
auto *
I : ScalarPtrs)
3126if (!PossibleNonScalarPtrs.
count(
I)) {
3131// Insert the forced scalars. 3132// FIXME: Currently VPWidenPHIRecipe() often creates a dead vector 3133// induction variable when the PHI user is scalarized. 3134auto ForcedScalar = ForcedScalars.
find(VF);
3135if (ForcedScalar != ForcedScalars.
end())
3136for (
auto *
I : ForcedScalar->second) {
3137LLVM_DEBUG(
dbgs() <<
"LV: Found (forced) scalar instruction: " << *
I <<
"\n");
3141// Expand the worklist by looking through any bitcasts and getelementptr 3142// instructions we've already identified as scalar. This is similar to the 3143// expansion step in collectLoopUniforms(); however, here we're only 3144// expanding to include additional bitcasts and getelementptr instructions. 3146while (
Idx != Worklist.
size()) {
3148if (!IsLoopVaryingGEP(Dst->getOperand(0)))
3150auto *Src = cast<Instruction>(Dst->getOperand(0));
3152 auto *J = cast<Instruction>(U);
3153 return !TheLoop->contains(J) || Worklist.count(J) ||
3154 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
3155 IsScalarUse(J, Src));
3158LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Src <<
"\n");
3162// An induction variable will remain scalar if all users of the induction 3163// variable and induction variable update remain scalar. 3165auto *Ind = Induction.first;
3166auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3168// If tail-folding is applied, the primary induction variable will be used 3169// to feed a vector compare. 3173// Returns true if \p Indvar is a pointer induction that is used directly by 3174// load/store instruction \p I. 3175auto IsDirectLoadStoreFromPtrIndvar = [&](
Instruction *Indvar,
3177return Induction.second.getKind() ==
3179 (isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
3183// Determine if all users of the induction variable are scalar after 3185bool ScalarInd =
all_of(Ind->users(), [&](
User *U) ->
bool {
3186 auto *I = cast<Instruction>(U);
3187 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3188 IsDirectLoadStoreFromPtrIndvar(Ind, I);
3193// If the induction variable update is a fixed-order recurrence, neither the 3194// induction variable or its update should be marked scalar after 3196auto *IndUpdatePhi = dyn_cast<PHINode>(IndUpdate);
3200// Determine if all users of the induction variable update instruction are 3201// scalar after vectorization. 3202bool ScalarIndUpdate =
all_of(IndUpdate->users(), [&](
User *U) ->
bool {
3203 auto *I = cast<Instruction>(U);
3204 return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
3205 IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
3207if (!ScalarIndUpdate)
3210// The induction variable and its update instruction will remain scalar. 3212 Worklist.
insert(IndUpdate);
3213LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *Ind <<
"\n");
3214LLVM_DEBUG(
dbgs() <<
"LV: Found scalar instruction: " << *IndUpdate
3226// Do we have a non-scalar lowering for this predicated 3227// instruction? No - it is scalar with predication. 3228switch(
I->getOpcode()) {
3231case Instruction::Call:
3234return CallWideningDecisions.at(std::make_pair(cast<CallInst>(
I), VF))
3236case Instruction::Load:
3237case Instruction::Store: {
3249case Instruction::UDiv:
3250case Instruction::SDiv:
3251case Instruction::SRem:
3252case Instruction::URem: {
3253// We have the option to use the safe-divisor idiom to avoid predication. 3254// The cost based decision here will always select safe-divisor for 3255// scalable vectors as scalarization isn't legal. 3262// TODO: Fold into LoopVectorizationLegality::isMaskRequired. 3264// If predication is not needed, avoid it. 3265// TODO: We can use the loop-preheader as context point here and get 3266// context sensitive reasoning for isSafeToSpeculativelyExecute. 3270 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(
I))
3273// If the instruction was executed conditionally in the original scalar loop, 3274// predication is needed with a mask whose lanes are all possibly inactive. 3278// All that remain are instructions with side-effects originally executed in 3279// the loop unconditionally, but now execute under a tail-fold mask (only) 3280// having at least one active lane (the first). If the side-effects of the 3281// instruction are invariant, executing it w/o (the tail-folding) mask is safe 3282// - it will cause the same side-effects as when masked. 3283switch(
I->getOpcode()) {
3286"instruction should have been considered by earlier checks");
3287case Instruction::Call:
3288// Side-effects of a Call are assumed to be non-invariant, needing a 3291"should have returned earlier for calls not needing a mask");
3293case Instruction::Load:
3294// If the address is loop invariant no predication is needed. 3296case Instruction::Store: {
  // For stores, we need to prove both speculation safety (which follows from
  // the same argument as loads), but also must prove the value being stored
  // is correct. The easiest form of the latter is to require that all values
  // stored are the same.
  case Instruction::UDiv:
3305case Instruction::SDiv:
3306case Instruction::SRem:
3307case Instruction::URem:
3308// If the divisor is loop-invariant no predication is needed. 3313std::pair<InstructionCost, InstructionCost>
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
3322// Scalarization isn't legal for scalable vector types 3325// Get the scalarization cost and scale this amount by the probability of 3326// executing the predicated block. If the instruction is not predicated, 3327// we fall through to the next case. 3328 ScalarizationCost = 0;
3330// These instructions have a non-void type, so account for the phi nodes 3331// that we will create. This cost is likely to be zero. The phi node 3332// cost, if any, should be scaled by the block probability because it 3333// models a copy at the end of each predicated block. 3337// The cost of the non-predicated instruction. 3341// The cost of insertelement and extractelement instructions needed for 3343 ScalarizationCost += getScalarizationOverhead(
I, VF);
3345// Scale the cost by the probability of executing the predicated blocks. 3346// This assumes the predicated block for each vector lane is equally 3354// The cost of the select guard to ensure all lanes are well defined 3355// after we speculate above any internal control flow. 3361// Certain instructions can be cheaper to vectorize if they have a constant 3362// second vector operand. One example of this are shifts on x86. 3363Value *Op2 =
I->getOperand(1);
3372 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
3374return {ScalarizationCost, SafeDivisorCost};
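// Hedged sketch of the trade-off computed above: the scalarization
// alternative is scaled by the probability that the predicated block actually
// executes, and the caller picks whichever of the two costs is smaller. The
// numbers and names below are hypothetical simplifications.
static bool preferSafeDivisor(double ScalarizationCost, double SafeDivisorCost,
                              double BlockExecProbability) {
  // Scalarized div/rem only runs when the predicated block is taken.
  double PredicatedScalarizationCost =
      ScalarizationCost * BlockExecProbability;
  return SafeDivisorCost < PredicatedScalarizationCost;
}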
3381"Decision should not be set yet.");
3383assert(Group &&
"Must have a group.");
3384unsigned InterleaveFactor = Group->getFactor();
3386// If the instruction's allocated size doesn't equal its type size, it 3387// requires padding and will be scalarized. 3388auto &
DL =
I->getDataLayout();
3393// We currently only know how to emit interleave/deinterleave with 3394// Factor=2 for scalable vectors. This is purely an implementation 3399// If the group involves a non-integral pointer, we may not be able to 3400// losslessly cast all values to a common type. 3401bool ScalarNI =
DL.isNonIntegralPointerType(ScalarTy);
3402for (
unsignedIdx = 0;
Idx < InterleaveFactor;
Idx++) {
3407bool MemberNI =
DL.isNonIntegralPointerType(
MemberTy);
3408// Don't coerce non-integral pointers to integers or vice versa. 3409if (MemberNI != ScalarNI)
3410// TODO: Consider adding special nullptr value case here 3412if (MemberNI && ScalarNI &&
3413 ScalarTy->getPointerAddressSpace() !=
3418// Check if masking is required. 3419// A Group may need masking for one of two reasons: it resides in a block that 3420// needs predication, or it was decided to use masking to deal with gaps 3421// (either a gap at the end of a load-access that may result in a speculative 3422// load, or any gaps in a store-access). 3423bool PredicatedAccessRequiresMasking =
3426bool LoadAccessWithGapsRequiresEpilogMasking =
3427 isa<LoadInst>(
I) && Group->requiresScalarEpilogue() &&
3429bool StoreAccessWithGapsRequiresMasking =
3430 isa<StoreInst>(
I) && (Group->getNumMembers() < Group->getFactor());
3431if (!PredicatedAccessRequiresMasking &&
3432 !LoadAccessWithGapsRequiresEpilogMasking &&
3433 !StoreAccessWithGapsRequiresMasking)
3436// If masked interleaving is required, we expect that the user/target had 3437// enabled it, because otherwise it either wouldn't have been created or 3438// it should have been invalidated by the CostModel. 3440"Masked interleave-groups for predicated accesses are not enabled.");
3442if (Group->isReverse())
3453// Get and ensure we have a valid memory instruction. 3454assert((isa<LoadInst, StoreInst>(
I)) &&
"Invalid memory instruction");
3459// In order to be widened, the pointer should be consecutive, first of all. 3463// If the instruction is a store located in a predicated block, it will be 3468// If the instruction's allocated size doesn't equal it's type size, it 3469// requires padding and will be scalarized. 3470auto &
DL =
I->getDataLayout();
3477void LoopVectorizationCostModel::collectLoopUniforms(
ElementCount VF) {
3478// We should not collect Uniforms more than once per VF. Right now, 3479// this function is called from collectUniformsAndScalars(), which 3480// already does this check. Collecting Uniforms for VF=1 does not make any 3484"This function should not be visited twice for the same VF");
3486// Visit the list of Uniforms. If we find no uniform value, we won't 3487// analyze again. Uniforms.count(VF) will return 1. 3488 Uniforms[VF].
clear();
3490// Now we know that the loop is vectorizable! 3491// Collect instructions inside the loop that will remain uniform after 3494// Global values, params and instructions outside of current loop are out of 3496auto IsOutOfScope = [&](
Value *V) ->
bool {
3501// Worklist containing uniform instructions demanding lane 0. 3504// Add uniform instructions demanding lane 0 to the worklist. Instructions 3505// that require predication must not be considered uniform after 3506// vectorization, because that would create an erroneous replicating region 3507// where only a single instance out of VF should be formed. 3508auto AddToWorklistIfAllowed = [&](
Instruction *
I) ->
void {
3509if (IsOutOfScope(
I)) {
3516dbgs() <<
"LV: Found not uniform due to requiring predication: " << *
I 3524// Start with the conditional branches exiting the loop. If the branch 3525// condition is an instruction contained in the loop that is only used by the 3526// branch, it is uniform. Note conditions from uncountable early exits are not 3533auto *
Cmp = dyn_cast<Instruction>(E->getTerminator()->getOperand(0));
3535 AddToWorklistIfAllowed(Cmp);
3539// Return true if all lanes perform the same memory operation, and we can 3540// thus choose to execute only one. 3542// If the value was already known to not be uniform for the previous 3543// (smaller VF), it cannot be uniform for the larger VF. 3544if (PrevVF.isVector()) {
3545auto Iter = Uniforms.
find(PrevVF);
3546if (Iter != Uniforms.
end() && !Iter->second.contains(
I))
3551if (isa<LoadInst>(
I))
3552// Loading the same address always produces the same result - at least 3553// assuming aliasing and ordering which have already been checked. 3555// Storing the same value on every iteration. 3562"Widening decision should be ready at this moment");
3564if (IsUniformMemOpUse(
I))
3567return (WideningDecision ==
CM_Widen ||
3572// Returns true if Ptr is the pointer operand of a memory access instruction 3573// I, I is known to not require scalarization, and the pointer is not also 3576if (isa<StoreInst>(
I) &&
I->getOperand(0) ==
Ptr)
3582// Holds a list of values which are known to have at least one uniform use. 3583// Note that there may be other uses which aren't uniform. A "uniform use" 3584// here is something which only demands lane 0 of the unrolled iterations; 3585// it does not imply that all lanes produce the same value (e.g. this is not 3586// the usual meaning of uniform) 3589// Scan the loop for instructions which are either a) known to have only 3590// lane 0 demanded or b) are uses which demand only lane 0 of their operand. 3592for (
auto &
I : *BB) {
3594switch (
II->getIntrinsicID()) {
3595case Intrinsic::sideeffect:
3596case Intrinsic::experimental_noalias_scope_decl:
3597case Intrinsic::assume:
3598case Intrinsic::lifetime_start:
3599case Intrinsic::lifetime_end:
3601 AddToWorklistIfAllowed(&
I);
3608// ExtractValue instructions must be uniform, because the operands are 3609// known to be loop-invariant. 3610if (
auto *EVI = dyn_cast<ExtractValueInst>(&
I)) {
3611assert(IsOutOfScope(EVI->getAggregateOperand()) &&
3612"Expected aggregate value to be loop invariant");
3613 AddToWorklistIfAllowed(EVI);
3617// If there's no pointer operand, there's nothing to do. 3622if (IsUniformMemOpUse(&
I))
3623 AddToWorklistIfAllowed(&
I);
3625if (IsVectorizedMemAccessUse(&
I,
Ptr))
3629// Add to the worklist any operands which have *only* uniform (e.g. lane 0 3630// demanding) users. Since loops are assumed to be in LCSSA form, this 3631// disallows uses outside the loop as well. 3632for (
auto *V : HasUniformUse) {
3635auto *
I = cast<Instruction>(V);
3636bool UsersAreMemAccesses =
all_of(
I->users(), [&](
User *U) ->
bool {
3637 auto *UI = cast<Instruction>(U);
3638 return TheLoop->contains(UI) && IsVectorizedMemAccessUse(UI, V);
3640if (UsersAreMemAccesses)
3641 AddToWorklistIfAllowed(
I);
3644// Expand Worklist in topological order: whenever a new instruction 3645// is added , its users should be already inside Worklist. It ensures 3646// a uniform instruction will only be used by uniform instructions. 3648while (
Idx != Worklist.
size()) {
3651for (
auto *OV :
I->operand_values()) {
3652// isOutOfScope operands cannot be uniform instructions. 3653if (IsOutOfScope(OV))
3655// First order recurrence Phi's should typically be considered 3657auto *
OP = dyn_cast<PHINode>(OV);
3660// If all the users of the operand are uniform, then add the 3661// operand into the uniform worklist. 3662auto *OI = cast<Instruction>(OV);
3664 auto *J = cast<Instruction>(U);
3665 return Worklist.count(J) || IsVectorizedMemAccessUse(J, OI);
3667 AddToWorklistIfAllowed(OI);
3671// For an instruction to be added into Worklist above, all its users inside 3672// the loop should also be in Worklist. However, this condition cannot be 3673// true for phi nodes that form a cyclic dependence. We must process phi 3674// nodes separately. An induction variable will remain uniform if all users 3675// of the induction variable and induction variable update remain uniform. 3676// The code below handles both pointer and non-pointer induction variables. 3679auto *Ind = Induction.first;
3680auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
3682// Determine if all users of the induction variable are uniform after 3684bool UniformInd =
all_of(Ind->users(), [&](
User *U) ->
bool {
3685 auto *I = cast<Instruction>(U);
3686 return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
3687 IsVectorizedMemAccessUse(I, Ind);
3692// Determine if all users of the induction variable update instruction are 3693// uniform after vectorization. 3694bool UniformIndUpdate =
all_of(IndUpdate->users(), [&](
User *U) ->
bool {
3695 auto *I = cast<Instruction>(U);
3696 return I == Ind || Worklist.count(I) ||
3697 IsVectorizedMemAccessUse(I, IndUpdate);
3699if (!UniformIndUpdate)
3702// The induction variable and its update instruction will remain uniform. 3703 AddToWorklistIfAllowed(Ind);
3704 AddToWorklistIfAllowed(IndUpdate);
3715"runtime pointer checks needed. Enable vectorization of this " 3716"loop with '#pragma clang loop vectorize(enable)' when " 3717"compiling with -Os/-Oz",
3724"runtime SCEV checks needed. Enable vectorization of this " 3725"loop with '#pragma clang loop vectorize(enable)' when " 3726"compiling with -Os/-Oz",
3731// FIXME: Avoid specializing for stride==1 instead of bailing out. 3734"runtime stride == 1 checks needed. Enable vectorization of " 3735"this loop without such check by compiling with -Os/-Oz",
3743bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3744if (IsScalableVectorizationAllowed)
3745return *IsScalableVectorizationAllowed;
3747 IsScalableVectorizationAllowed =
false;
3760 std::numeric_limits<ElementCount::ScalarTy>::max());
3762// Test that the loop-vectorizer can legalize all operations for this MaxVF. 3763// FIXME: While for scalable vectors this is currently sufficient, this should 3764// be replaced by a more detailed mechanism that filters out specific VFs, 3765// instead of invalidating vectorization for a whole set of VFs based on the 3768// Disable scalable vectorization if the loop contains unsupported reductions. 3771"Scalable vectorization not supported for the reduction " 3772"operations found in this loop.",
3777// Disable scalable vectorization if the loop contains any instructions 3778// with element types not supported for scalable vectors. 3784"for all element types found in this loop.",
3791"for safe distance analysis.",
3796 IsScalableVectorizationAllowed =
true;
3801LoopVectorizationCostModel::getMaxLegalScalableVF(
unsigned MaxSafeElements) {
3802if (!isScalableVectorizationAllowed())
3806 std::numeric_limits<ElementCount::ScalarTy>::max());
3808return MaxScalableVF;
3811// Limit MaxScalableVF by the maximum safe dependence distance. 3816"Max legal vector width too small, scalable vectorization " 3820return MaxScalableVF;
3824unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking) {
3826unsigned SmallestType, WidestType;
3829// Get the maximum safe dependence distance in bits computed by LAA. 3830// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from 3831// the memory accesses that is most restrictive (involved in the smallest 3832// dependence distance). 3833unsigned MaxSafeElements =
3837auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
3839 this->MaxSafeElements = MaxSafeElements;
3841LLVM_DEBUG(
dbgs() <<
"LV: The max safe fixed VF is: " << MaxSafeFixedVF
3843LLVM_DEBUG(
dbgs() <<
"LV: The max safe scalable VF is: " << MaxSafeScalableVF
3846// First analyze the UserVF, fall back if the UserVF should be ignored. 3849 UserVF.
isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
3852// If `VF=vscale x N` is safe, then so is `VF=N` 3862// Only clamp if the UserVF is not scalable. If the UserVF is scalable, it 3863// is better to ignore the hint and let the compiler choose a suitable VF. 3866 <<
" is unsafe, clamping to max safe VF=" 3867 << MaxSafeFixedVF <<
".\n");
3872 <<
"User-specified vectorization factor " 3873 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3874 <<
" is unsafe, clamping to maximum safe vectorization factor " 3875 <<
ore::NV(
"VectorizationFactor", MaxSafeFixedVF);
3877return MaxSafeFixedVF;
3882 <<
" is ignored because scalable vectors are not " 3888 <<
"User-specified vectorization factor " 3889 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3890 <<
" is ignored because the target does not support scalable " 3891"vectors. The compiler will pick a more suitable value.";
3895 <<
" is unsafe. Ignoring scalable UserVF.\n");
3900 <<
"User-specified vectorization factor " 3901 <<
ore::NV(
"UserVectorizationFactor", UserVF)
3902 <<
" is unsafe. Ignoring the hint to let the compiler pick a " 3903"more suitable value.";
3908LLVM_DEBUG(
dbgs() <<
"LV: The Smallest and Widest types: " << SmallestType
3909 <<
" / " << WidestType <<
" bits.\n");
3914 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3915 MaxSafeFixedVF, FoldTailByMasking))
3919 getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
3920 MaxSafeScalableVF, FoldTailByMasking))
3921if (MaxVF.isScalable()) {
3922Result.ScalableVF = MaxVF;
3933// TODO: It may be useful to do since it's still likely to be dynamically 3934// uniform if the target can skip. 3936"Not inserting runtime ptr check for divergent target",
3937"runtime pointer checks needed. Not enabled for divergent target",
3938"CantVersionLoopWithDivergentTarget",
ORE,
TheLoop);
3947LLVM_DEBUG(
dbgs() <<
"LV: Found maximum trip count: " << MaxTC <<
'\n');
3950"loop trip count is one, irrelevant for vectorization",
3955// If BTC matches the widest induction type and is -1 then the trip count 3956// computation will wrap to 0 and the vector trip count will be 0. Do not try 3959if (!isa<SCEVCouldNotCompute>(BTC) &&
3965"Trip count computation wrapped",
3966"backedge-taken count is -1, loop trip count wrapped to 0",
3971switch (ScalarEpilogueStatus) {
3973return computeFeasibleMaxVF(MaxTC, UserVF,
false);
3978dbgs() <<
"LV: vector predicate hint/switch found.\n" 3979 <<
"LV: Not allowing scalar epilogue, creating predicated " 3980 <<
"vector loop.\n");
3983// fallthrough as a special case of OptForSize 3987dbgs() <<
"LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3989LLVM_DEBUG(
dbgs() <<
"LV: Not allowing scalar epilogue due to low trip " 3992// Bail if runtime checks are required, which are not good when optimising 4000// The only loops we can vectorize without a scalar epilogue, are loops with 4001// a bottom-test and a single exiting block. We'd have to handle the fact 4002// that not every instruction executes on the last iteration. This will 4003// require a lane mask which varies through the vector loop body. (TODO) 4005// If there was a tail-folding hint/switch, but we can't fold the tail by 4006// masking, fallback to a vectorization with a scalar epilogue. 4008LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a " 4009"scalar epilogue instead.\n");
4011return computeFeasibleMaxVF(MaxTC, UserVF,
false);
4016// Now try the tail folding 4018// Invalidate interleave groups that require an epilogue if we can't mask 4019// the interleave-group. 4022"No decisions should have been taken at this point");
4023// Note: There is no need to invalidate any cost modeling decisions here, as 4024// none were taken so far. 4030// Avoid tail folding if the trip count is known to be a multiple of any VF 4032 std::optional<unsigned> MaxPowerOf2RuntimeVF =
4037 MaxPowerOf2RuntimeVF = std::max<unsigned>(
4038 *MaxPowerOf2RuntimeVF,
4041 MaxPowerOf2RuntimeVF = std::nullopt;
// Stick with tail-folding for now. 4044if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4046"MaxFixedVF must be a power of 2");
4047unsigned MaxVFtimesIC =
4048 UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4050// Currently only loops with countable exits are vectorized, but calling 4051// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with 4052// uncountable exits whilst also ensuring the symbolic maximum and known 4053// back-edge taken count remain identical for loops with countable exits. 4056"Invalid loop count");
4058 BackedgeTakenCount, SE->
getOne(BackedgeTakenCount->
getType()));
4063// Accept MaxFixedVF if we do not have a tail. 4064LLVM_DEBUG(
dbgs() <<
"LV: No tail will remain for any chosen VF.\n");
4069// If we don't know the precise trip count, or if the trip count that we 4070// found modulo the vectorization factor is not zero, try to fold the tail 4072// FIXME: look for a smaller MaxVF that does divide TC rather than masking. 4078 <<
"LV: tail is folded with EVL, forcing unroll factor to be 1. Will " 4079"try to generate VP Intrinsics with scalable vector " 4081// Tail folded loop using VP intrinsics restricts the VF to be scalable 4083// TODO: extend it for fixed vectors, if required. 4085"Expected scalable vector factor.");
4092// If there was a tail-folding hint/switch, but we can't fold the tail by 4093// masking, fallback to a vectorization with a scalar epilogue. 4095LLVM_DEBUG(
dbgs() <<
"LV: Cannot fold tail by masking: vectorize with a " 4096"scalar epilogue instead.\n");
4102LLVM_DEBUG(
dbgs() <<
"LV: Can't fold tail by masking: don't vectorize\n");
4108"unable to calculate the loop count due to complex control flow",
4114"Cannot optimize for size and vectorize at the same time.",
4115"cannot optimize for size and vectorize at the same time. " 4116"Enable vectorization of this loop with '#pragma clang loop " 4117"vectorize(enable)' when compiling with -Os/-Oz",
4122ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4123unsigned MaxTripCount,
unsigned SmallestType,
unsigned WidestType,
4125bool ComputeScalableMaxVF = MaxSafeVF.
isScalable();
4130// Convenience function to return the minimum of two ElementCounts. 4133"Scalable flags must match");
4137// Ensure MaxVF is a power of 2; the dependence distance bound may not be. 4138// Note that both WidestRegister and WidestType may not be a powers of 2. 4141 ComputeScalableMaxVF);
4142 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
4144 << (MaxVectorElementCount * WidestType) <<
" bits.\n");
4146if (!MaxVectorElementCount) {
4148 << (ComputeScalableMaxVF ?
"scalable" :
"fixed")
4149 <<
" vector registers.\n");
4153unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
4154if (MaxVectorElementCount.isScalable() &&
4158 WidestRegisterMinEC *= Min;
4161// When a scalar epilogue is required, at least one iteration of the scalar 4162// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a 4163// max VF that results in a dead vector loop. 4167if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
4169// If upper bound loop trip count (TC) is known at compile time there is no 4170// point in choosing VF greater than TC (as done in the loop below). Select 4171// maximum power of two which doesn't exceed TC. If MaxVectorElementCount is 4172// scalable, we only fall back on a fixed VF when the TC is less than or 4173// equal to the known number of lanes. 4175LLVM_DEBUG(
dbgs() <<
"LV: Clamping the MaxVF to maximum power of two not " 4176"exceeding the constant trip count: " 4177 << ClampedUpperTripCount <<
"\n");
4179 ClampedUpperTripCount,
4180 FoldTailByMasking ? MaxVectorElementCount.isScalable() :
false);
4193 ComputeScalableMaxVF);
4194 MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
4196// Collect all viable vectorization factors larger than the default MaxVF 4197// (i.e. MaxVectorElementCount). 4203// For each VF calculate its register usage. 4206// Select the largest VF which doesn't require more registers than existing 4208for (
intI = RUs.size() - 1;
I >= 0; --
I) {
4209constauto &MLU = RUs[
I].MaxLocalUsers;
4210if (
all_of(MLU, [&](
decltype(MLU.front()) &LU) {
4211 return LU.second <= TTI.getNumberOfRegisters(LU.first);
4221 <<
") with target's minimum: " << MinVF <<
'\n');
4226// Invalidate any widening decisions we might have made, in case the loop 4227// requires prediction (decided later), but we have already made some 4228// load/store widening decisions. 4234/// Convenience function that returns the value of vscale_range iff 4235/// vscale_range.min == vscale_range.max or otherwise returns the value 4236/// returned by the corresponding TTI method. 4237static std::optional<unsigned>
4239constFunction *Fn = L->getHeader()->getParent();
4243auto Max = Attr.getVScaleRangeMax();
4244if (Max && Min == Max)
/// This function attempts to return a value that represents the vectorization
/// factor at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// value of vscale.
    EstimatedVF *= *VScale;
  assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4266bool LoopVectorizationPlanner::isMoreProfitable(
4268constunsigned MaxTripCount)
const{
4272// Improve estimate for the vector width if it is scalable. 4273unsigned EstimatedWidthA =
A.Width.getKnownMinValue();
4274unsigned EstimatedWidthB =
B.Width.getKnownMinValue();
4276if (
A.Width.isScalable())
4277 EstimatedWidthA *= *VScale;
4278if (
B.Width.isScalable())
4279 EstimatedWidthB *= *VScale;
4282// Assume vscale may be larger than 1 (or the value being tuned for), 4283// so that scalable vectorization is slightly favorable over fixed-width 4286A.Width.isScalable() && !
B.Width.isScalable();
  // To avoid the need for FP division:
  //      (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
  // <=>  (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
  if (!MaxTripCount)
    return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);

  auto GetCostForTC = [MaxTripCount, this](unsigned VF,
                                           InstructionCost VectorCost,
                                           InstructionCost ScalarCost) {
    // If the trip count is a known (possibly small) constant, the trip count
    // will be rounded up to an integer number of iterations under
    // FoldTailByMasking. The total cost in that case will be
    // VecCost*ceil(TripCount/VF). When not folding the tail, the total
    // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
    // some extra overheads, but for the purpose of comparing the costs of
    // different VFs we can use this to compare the total loop-body cost
    // expected after vectorization.
    if (CM.foldTailByMasking())
      return VectorCost * divideCeil(MaxTripCount, VF);
    return VectorCost * (MaxTripCount / VF) + ScalarCost * (MaxTripCount % VF);
  };

  auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
  auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
  return CmpFn(RTCostA, RTCostB);
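// Hedged sketch of the comparison above for a known trip count, assuming the
// comparison function is a plain strictly-less-than; all names are
// hypothetical stand-ins for the VectorizationFactor candidates.
#include <cstdint>

static uint64_t loopBodyCostForTC(uint64_t TC, uint64_t VF, uint64_t VectorCost,
                                  uint64_t ScalarCost, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return VectorCost * ((TC + VF - 1) / VF); // VecCost * ceil(TC / VF)
  return VectorCost * (TC / VF) + ScalarCost * (TC % VF);
}

static bool isMoreProfitableSketch(uint64_t CostA, uint64_t WidthA,
                                   uint64_t CostB, uint64_t WidthB) {
  // Per-lane comparison without FP division:
  //   CostA / WidthA < CostB / WidthB  <=>  CostA * WidthB < CostB * WidthA
  return CostA * WidthB < CostB * WidthA;
}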
4320bool LoopVectorizationPlanner::isMoreProfitable(
4323return LoopVectorizationPlanner::isMoreProfitable(
A,
B, MaxTripCount);
4328usingRecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4330for (
constauto &Plan : VPlans) {
4334 precomputeCosts(*Plan, VF, CostCtx);
4336for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4337for (
auto &R : *VPBB) {
4338if (!R.cost(VF, CostCtx).isValid())
4344if (InvalidCosts.
empty())
4347// Emit a report of VFs with invalid costs in the loop. 4349// Group the remarks per recipe, keeping the recipe order from InvalidCosts. 4352for (
auto &Pair : InvalidCosts)
4353if (!Numbering.
count(Pair.first))
4354 Numbering[Pair.first] =
I++;
4356// Sort the list, first on recipe(number) then on VF. 4357sort(InvalidCosts, [&Numbering](RecipeVFPair &
A, RecipeVFPair &
B) {
4358if (Numbering[
A.first] != Numbering[
B.first])
4359return Numbering[
A.first] < Numbering[
B.first];
4360constauto &
LHS =
A.second;
4361constauto &
RHS =
B.second;
4362return std::make_tuple(
LHS.isScalable(),
LHS.getKnownMinValue()) <
4363 std::make_tuple(
RHS.isScalable(),
RHS.getKnownMinValue());
4366// For a list of ordered recipe-VF pairs: 4367// [(load, VF1), (load, VF2), (store, VF1)] 4368// group the recipes together to emit separate remarks for: 4375 Subset =
Tail.take_front(1);
4382 [](
constauto *R) {
return Instruction::PHI; })
4383 .Case<VPWidenSelectRecipe>(
4384 [](
constauto *R) {
return Instruction::Select; })
4385 .Case<VPWidenStoreRecipe>(
4386 [](
constauto *R) {
return Instruction::Store; })
4387 .Case<VPWidenLoadRecipe>(
4388 [](
constauto *R) {
return Instruction::Load; })
4389 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4390 [](
constauto *R) {
return Instruction::Call; })
4393 [](
constauto *R) {
return R->getOpcode(); })
4395return R->getStoredValues().empty() ? Instruction::Load
4396 : Instruction::Store;
4399// If the next recipe is different, or if there are no other pairs, 4400// emit a remark for the collated subset. e.g. 4401// [(load, VF1), (load, VF2))] 4403// remark: invalid costs for 'load' at VF=(VF1, VF2) 4404if (Subset ==
Tail ||
Tail[Subset.size()].first != R) {
4405 std::string OutString;
4407assert(!Subset.empty() &&
"Unexpected empty range");
4408OS <<
"Recipe with invalid costs prevented vectorization at VF=(";
4409for (
constauto &Pair : Subset)
4410OS << (Pair.second == Subset.front().second ?
"" :
", ") << Pair.second;
4412if (Opcode == Instruction::Call) {
4414if (
auto *
Int = dyn_cast<VPWidenIntrinsicRecipe>(R)) {
4417auto *WidenCall = dyn_cast<VPWidenCallRecipe>(R);
4419 WidenCall ? WidenCall->getCalledScalarFunction()
4420 : cast<Function>(R->getOperand(R->getNumOperands() - 1)
4421 ->getLiveInIRValue());
4429Tail =
Tail.drop_front(Subset.size());
4432// Grow the subset by one element 4433 Subset =
Tail.take_front(Subset.size() + 1);
4434 }
while (!
Tail.empty());
4437/// Check if any recipe of \p Plan will generate a vector value, which will be 4438/// assigned a vector register. 4445// Set of already visited types. 4447for (
VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4452// Continue early if the recipe is considered to not produce a vector 4453// result. Note that this includes VPInstruction where some opcodes may 4454// produce a vector, to preserve existing behavior as VPInstructions model 4455// aspects not directly mapped to existing IR instructions. 4456switch (R.getVPDefID()) {
4457case VPDef::VPDerivedIVSC:
4458case VPDef::VPScalarIVStepsSC:
4459case VPDef::VPScalarCastSC:
4460case VPDef::VPReplicateSC:
4461case VPDef::VPInstructionSC:
4462case VPDef::VPCanonicalIVPHISC:
4463case VPDef::VPVectorPointerSC:
4464case VPDef::VPReverseVectorPointerSC:
4465case VPDef::VPExpandSCEVSC:
4466case VPDef::VPEVLBasedIVPHISC:
4467case VPDef::VPPredInstPHISC:
4468case VPDef::VPBranchOnMaskSC:
4470case VPDef::VPReductionSC:
4471case VPDef::VPActiveLaneMaskPHISC:
4472case VPDef::VPWidenCallSC:
4473case VPDef::VPWidenCanonicalIVSC:
4474case VPDef::VPWidenCastSC:
4475case VPDef::VPWidenGEPSC:
4476case VPDef::VPWidenIntrinsicSC:
4477case VPDef::VPWidenSC:
4478case VPDef::VPWidenSelectSC:
4479case VPDef::VPBlendSC:
4480case VPDef::VPFirstOrderRecurrencePHISC:
4481case VPDef::VPWidenPHISC:
4482case VPDef::VPWidenIntOrFpInductionSC:
4483case VPDef::VPWidenPointerInductionSC:
4484case VPDef::VPReductionPHISC:
4485case VPDef::VPInterleaveSC:
4486case VPDef::VPWidenLoadEVLSC:
4487case VPDef::VPWidenLoadSC:
4488case VPDef::VPWidenStoreEVLSC:
4489case VPDef::VPWidenStoreSC:
4495auto WillWiden = [&
TTI, VF](
Type *ScalarTy) {
4501// <vscale x 1 x iN> is assumed to be profitable over iN because 4502// scalable registers are a distinct register class from scalar 4503// ones. If we ever find a target which wants to lower scalable 4504// vectors back to scalars, we'll need to update this code to 4505// explicitly ask TTI about the register class uses for each part. 4508// Two or more parts that share a register - are vectorized. 4512// If no def nor is a store, e.g., branches, continue - no value to check. 4513if (R.getNumDefinedValues() == 0 &&
4514 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4517// For multi-def recipes, currently only interleaved loads, suffice to 4518// check first def only. 4519// For stores check their stored value; for interleaved stores suffice 4520// the check first stored value only. In all cases this is the second 4523 R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
4525if (!Visited.
insert({ScalarTy}).second)
4527if (WillWiden(ScalarTy))
4538LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << ExpectedCost <<
".\n");
4539assert(ExpectedCost.
isValid() &&
"Unexpected invalid cost for scalar loop");
4541 [](std::unique_ptr<VPlan> &
P) {
4544"Expected Scalar VF to be a candidate");
4551if (ForceVectorization &&
4552 (VPlans.
size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4553// Ignore scalar width, because the user explicitly wants vectorization. 4554// Initialize cost to max so that VF = 2 is, at least, chosen during cost 4559for (
auto &
P : VPlans) {
4561// The cost for scalar VF=1 is already calculated, so ignore it. 4570 <<
" costs: " << (Candidate.Cost / Width));
4580 <<
"LV: Not considering vector loop of width " << VF
4581 <<
" because it will not generate any vector instructions.\n");
4585if (isMoreProfitable(Candidate, ChosenFactor))
4586 ChosenFactor = Candidate;
4592"There are conditional stores.",
4593"store that is conditionally executed prevents vectorization",
4594"ConditionalStore", ORE, OrigLoop);
4595 ChosenFactor = ScalarCost;
4599 !isMoreProfitable(ChosenFactor, ScalarCost))
dbgs()
4600 <<
"LV: Vectorization seems to be not beneficial, " 4601 <<
"but was forced by a user.\n");
bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
  // Cross iteration phis such as reductions need special handling and are
  // currently unsupported.
        [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); }))

  // Phis with uses outside of the loop require special handling and are
  // currently unsupported.
    // Look for uses of the value of the induction at the last iteration.
      if (!OrigLoop->contains(cast<Instruction>(U)))
    // Look for uses of penultimate value of the induction.
      if (!OrigLoop->contains(cast<Instruction>(U)))

  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
  // TODO: Add support for loops with an early exit.

  // FIXME: We need a much better cost-model to take different parameters such
  // as register pressure, code size increase and cost of extra branches into
  // account. For now we apply a very crude heuristic and only consider loops
  // with vectorization factors larger than a certain value.

  // Allow the target to opt out entirely.

  // We also consider epilogue vectorization unprofitable for targets that
  // don't consider interleaving beneficial (e.g. MVE).

  // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
  // VFs when deciding profitability.
  // See related "TODO: extend to support scalable VFs." in
  // selectEpilogueVectorizationFactor.
  unsigned Multiplier = VF.isFixed() ? IC : 1;

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");

    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");

      LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
      dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.

  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
        (NextVF.Width.isScalable() &&
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        assert(!isa<SCEVCouldNotCompute>(TC) &&
               "Trip count SCEV must be computable");
                        << MaxTripCount << "\n");
              SE.getConstant(TCType, NextVF.Width.getKnownMinValue()),
              RemainingIterations))

    if (Result.Width.isScalar() ||
        isMoreProfitable(NextVF, Result, MaxTripCount))

                    << Result.Width << "\n");
std::pair<unsigned, unsigned>
  unsigned MinWidth = -1U;
  unsigned MaxWidth = 8;

  // For in-loop reductions, no element types are added to ElementTypesInLoop
  // if there are no loads/stores in the loop. In this case, check through the
  // reduction variables to determine the maximum width.
    // Reset MaxWidth so that we can find the smallest type used by recurrences
      // When finding the min width used by the recurrence we need to account
      // for casts on the input operands of the recurrence.
      MaxWidth = std::min<unsigned>(
          MaxWidth, std::min<unsigned>(
      MinWidth = std::min<unsigned>(
          MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());
      MaxWidth = std::max<unsigned>(
          MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedValue());

  return {MinWidth, MaxWidth};
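// Illustrative sketch (not part of the pass): the scan above is a running
// min/max over the scalar bit widths of the loads, stores and recurrences
// seen in the loop. A hypothetical standalone form over precomputed widths:
auto SketchSmallestAndWidestBits = [](llvm::ArrayRef<unsigned> Widths) {
  unsigned MinWidth = -1U; // "nothing seen yet"
  unsigned MaxWidth = 8;   // never report narrower than a byte
  for (unsigned W : Widths) {
    MinWidth = std::min(MinWidth, W);
    MaxWidth = std::max(MaxWidth, W);
  }
  return std::make_pair(MinWidth, MaxWidth);
};
(void)SketchSmallestAndWidestBits;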
  // For each instruction in the loop.
      // Skip ignored values.

      // Only examine Loads, Stores and PHINodes.
      if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))

      // Examine PHI nodes that are reduction variables. Update the type to
      // account for the recurrence type.
      if (auto *PN = dyn_cast<PHINode>(&I)) {

      // Examine the stored values.
      if (auto *ST = dyn_cast<StoreInst>(&I))
        T = ST->getValueOperand()->getType();

             "Expected the load/store/recurrence type to be sized");
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  //    iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  //    overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  //    due to the increased register pressure.

  // Do not interleave if EVL is preferred and no User IC is specified.
    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                         "Unroll factor forced to be 1.\n");

  // We used the distance for the interleave count.

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // them.

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");

  // Loop body is free and there is no need for interleaving.

  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(Pair.second, 1U);

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  for (const auto &Pair : R.MaxLocalUsers) {
                      << " register class\n");
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.find(Pair.first) != R.LoopInvariantRegs.end())
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC =
        llvm::bit_floor((TargetNumRegisters - LoopInvariantRegs) /
                        // Don't count the induction variable as interleaved.
                        std::max(1U, (MaxLocalUsers - 1)));

    IC = std::min(IC, TmpIC);
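    // Illustrative sketch (not part of the pass): the per-register-class
    // formula applied just above, as a standalone helper. Loop-invariant
    // values occupy registers in every interleaved copy, so they are
    // subtracted first; the induction variable is not counted; the result is
    // rounded down to a power of two. Names are hypothetical.
    auto SketchInterleaveCountForClass = [](unsigned TargetNumRegisters,
                                            unsigned LoopInvariantRegs,
                                            unsigned MaxLocalUsers) -> unsigned {
      unsigned Usable = TargetNumRegisters > LoopInvariantRegs
                            ? TargetNumRegisters - LoopInvariantRegs
                            : 0;
      unsigned PerCopy = MaxLocalUsers > 1 ? MaxLocalUsers - 1 : 1;
      unsigned Count = llvm::bit_floor(Usable / PerCopy);
      return Count ? Count : 1;
    };
    (void)SketchInterleaveCountForClass;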
  // Clamp the interleave ranges to reasonable counts.

  // Check if the user has overridden the max.

    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    unsigned AvailableTC =

    // If trip count is known we select between two prospective ICs, where
    // 1) the aggressive IC is capped by the trip count divided by VF
    // 2) the conservative IC is capped by the trip count divided by (VF * 2)
    // The final IC is selected in a way that the epilogue loop trip count is
    // minimized while maximizing the IC itself, so that we either run the
    // vector loop at least once if it generates a small epilogue loop, or else
    // we run the vector loop at least twice.
        std::max(1u, std::min(AvailableTC / EstimatedVF, MaxInterleaveCount)));
    unsigned InterleaveCountLB = bit_floor(std::max(
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));
    MaxInterleaveCount = InterleaveCountLB;

    if (InterleaveCountUB != InterleaveCountLB) {
      unsigned TailTripCountUB =
          (AvailableTC % (EstimatedVF * InterleaveCountUB));
      unsigned TailTripCountLB =
          (AvailableTC % (EstimatedVF * InterleaveCountLB));
      // If both produce the same scalar tail, maximize the IC to do the same
      // work in fewer vector loop iterations.
      if (TailTripCountUB == TailTripCountLB)
        MaxInterleaveCount = InterleaveCountUB;
    }
  } else if (BestKnownTC && *BestKnownTC > 0) {
    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
                              ? (*BestKnownTC) - 1

    // If trip count is an estimated compile time constant, limit the
    // IC to be capped by the trip count divided by VF * 2, such that the vector
    // loop runs at least twice to make interleaving seem profitable when there
    // is an epilogue loop present. Since the exact trip count is not known we
    // choose to be conservative in our IC estimate.
        1u, std::min(AvailableTC / (EstimatedVF * 2), MaxInterleaveCount)));

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between 1 and the max interleave count
  // that the target and trip count allow.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
    // Make sure IC is greater than 0.
    IC = std::max(1u, IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
        return Legal->blockNeedsPredication(BB);
  bool ScalarInterleavingRequiresRuntimePointerCheck =

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor<uint64_t>(

    // Interleave until store/load ports (estimated by max interleave count)
    // are saturated.
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          RecurKind RK = RdxDesc.getRecurrenceKind();
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
                 RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK);
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    bool HasOrderedReductions =
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          return RdxDesc.isOrdered();
    if (HasOrderedReductions) {
        dbgs() << "LV: Not interleaving scalar ordered reductions.\n");

      SmallIC = std::min(SmallIC, F);
      StoresIC = std::min(StoresIC, F);
      LoadsIC = std::min(LoadsIC, F);

        std::max(StoresIC, LoadsIC) > SmallIC) {
        dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(StoresIC, LoadsIC);

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleaveReductions) {

      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(IC / 2, SmallIC);

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
// This function calculates the register usage by measuring the highest number
// of values that are alive at a single location. Obviously, this is a very
// rough estimation. We scan the loop in topological order and assign a number
// to each instruction. We use RPO to ensure that defs are met before their
// users. We assume that each instruction that has in-loop users starts an
// interval. We record every time that an in-loop value is used, so we have a
// list of the first and last occurrences of each instruction. Next, we
// transpose this data structure into a multi map that holds the list of
// intervals that *end* at a specific location. This multi map allows us to
// perform a linear search. We scan the instructions linearly and record each
// time that a new interval starts, by placing it in a set. If we find this
// value in the multi-map then we remove it from the set. The max register
// usage is the maximum size of the set.
// We also search for instructions that are defined outside the loop, but are
// used inside the loop. We need this number separately from the max-interval
// usage number because when we unroll, loop-invariant values do not take
// registers.

  // Each 'key' in the map opens a new interval. The values
  // of the map are the index of the 'last seen' usage of the
  // instruction that is the key.

  // Maps instruction to its index.
  // Marks the end of each interval.
  // Saves the list of instruction indices that are used in the loop.
  // Saves the list of values that are used in the loop but are defined outside
  // the loop (not including non-instruction values such as arguments and
  // constants).

      // Save the end location of each USE.
      for (Value *U : I.operands()) {
        auto *Instr = dyn_cast<Instruction>(U);

        // Ignore non-instruction values such as arguments, constants, etc.
        // FIXME: Might need some motivation why these values are ignored. If
        // for example an argument is used inside the loop it will increase the
        // register pressure (so shouldn't we add it to LoopInvariants).

        // If this instruction is outside the loop then record it and continue.
          LoopInvariants.insert(Instr);

        // Overwrite previous end points.
        EndPoint[Instr] = IdxToInstr.size();

  // Saves the list of intervals that end with the index in 'key'.

  // Transpose the EndPoints to a list of values that end at each index.

  LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");

  const auto &TTICapture = TTI;
        !TTICapture.isElementTypeLegalForScalableVector(Ty)))

  for (unsigned int Idx = 0, Sz = IdxToInstr.size(); Idx < Sz; ++Idx) {

    // Remove all of the instructions that end at this location.
    InstrList &List = TransposeEnds[Idx];

    // Ignore instructions that are never used within the loop.

    // Skip ignored values.

    // For each VF find the maximum usage of registers.
    for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
      // Count the number of registers used, per register class, given all open
      // intervals.
      // Note that elements in this SmallMapVector will be default constructed
      // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
      // there is no previous entry for ClassID.
      if (VFs[J].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.

          // FIXME: The target might use more than one register for the type
          // even in the scalar case.
          RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[J]);

        auto &Entry = MaxUsages[J][Pair.first];
        Entry = std::max(Entry, Pair.second);

                      << OpenIntervals.size() << '\n');

    // Add the current instruction to the list of open intervals.

  // Note that elements in this SmallMapVector will be default constructed
  // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
  // there is no previous entry for ClassID.
  for (auto *Inst : LoopInvariants) {
    // FIXME: The target might use more than one register for the type
    // even in the scalar case.
    bool IsScalar = all_of(Inst->users(), [&](User *U) {
      auto *I = cast<Instruction>(U);
      return TheLoop != LI->getLoopFor(I->getParent()) ||
             isScalarAfterVectorization(I, VFs[Idx]);

    Invariant[ClassID] += GetRegUsage(Inst->getType(), VF);

    dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
    dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
    for (const auto &pair : MaxUsages[Idx]) {
      dbgs() << "LV(REG): RegisterClass: "
    dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
    for (const auto &pair : Invariant) {
      dbgs() << "LV(REG): RegisterClass: "

bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
  // TODO: Cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving "masked load/store" check from legality to cost model.
  // Masked Load/Gather emulation was previously never allowed.
  // Limited number of Masked Store/Scatter emulation was allowed.
         "Expecting a scalar emulated instruction");
  return isa<LoadInst>(I) ||
         (isa<StoreInst>(I) &&
  // If we aren't vectorizing the loop, or if we've already collected the
  // instructions to scalarize, there's nothing to do. Collection may already
  // have occurred if we have a user-selected VF and are now computing the
  // expected cost for interleaving.

  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  PredicatedBBsAfterVectorization[VF].clear();

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.

        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        //    of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
            !useEmulatedMaskMemRefHack(&I, VF) &&
            computePredInstDiscount(&I, ScalarCosts, VF) >= 0) {

          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, _] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(I);
            if (!CI || !CallWideningDecisions.contains({CI, VF}))
            CallWideningDecisions[{CI, VF}].Cost = ScalarCosts[CI];

      // Remember that BB will remain after vectorization.
      PredicatedBBsAfterVectorization[VF].insert(BB);
        if (Pred->getSingleSuccessor() == BB)
          PredicatedBBsAfterVectorization[VF].insert(Pred);

         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.

  // Returns true if the given instruction can be scalarized.
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // profitable.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get()))

    // Otherwise, we can scalarize the instruction.

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  while (!Worklist.empty()) {

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(I))

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.

    // Compute the scalarization overhead of needed insertelement instructions.

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // extraction.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(U.get())) {
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
        else if (needsExtract(J, VF)) {
              cast<VectorType>(toVectorTy(J->getType(), VF)),

    // Scale the total scalar cost by block probability.

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
                                    ValuesToIgnoreForVF);

  // For each instruction in the old loop.
      // Skip ignored values.

      // Check if we should override the cost.
                        << VF << " For instruction: " << I << '\n');

      // If we are vectorizing a predicated block, it will have been
      // if-converted. This means that the block's instructions (aside from
      // stores and instructions that may divide by zero) will now be
      // unconditionally executed. For the scalar case, we may not always execute
      // the predicated block, if it is an if-else block. Thus, scale the block's
      // cost by the probability of executing it. blockNeedsPredication from
      // Legal is used so as to not include all blocks in tail folded loops.

/// Gets Address Access SCEV after verifying that the access pattern
/// is loop invariant except the induction variable dependence.
///
/// This SCEV can be sent to the Target in order to estimate the address
/// calculation cost.
                                          const Loop *TheLoop) {
  auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);

  // We are looking for a gep with all loop invariant indices except for one
  // which should be an induction variable.
  auto *SE = PSE.getSE();
  unsigned NumOperands = Gep->getNumOperands();
  for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
        !Legal->isInductionVariable(Opd))

  // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.

LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
         "Scalarization cost of instruction implies vectorization.");

  // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
  //       that it is being called from this specific place.

  // Figure out whether the access is strided and get the stride value
  // if it's known in compile time.

  // Get the cost of the scalar memory instruction and address computation.

  // Don't pass *I here, since it is scalar but will actually be part of a
  // vectorized loop where the user of it is a vectorized instruction.

  // Get the overhead of the extractelement and insertelement instructions
  // we might create due to scalarization.
  Cost += getScalarizationOverhead(I, VF);

  // If we have a predicated load/store, it will need extra i1 extracts and
  // conditional branches, but may not be executed for each vector lane. Scale
  // the cost by the probability of executing the predicated block.

    // Add the cost of an i1 extract and a branch
        /*Insert=*/false, /*Extract=*/true, CostKind);

    if (useEmulatedMaskMemRefHack(I, VF))
      // Artificially setting to a high enough value to practically disable
      // vectorization with such operations.

LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
         "Stride should be 1 or -1 for consecutive memory access");
  bool Reverse = ConsecutiveStride < 0;
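  // Illustrative sketch (not part of the pass): for a consecutive access the
  // widened cost is one wide load/store plus a reverse shuffle when the
  // stride is -1. Plain integer costs stand in for the TTI queries used here.
  auto SketchConsecutiveMemOpCost = [](long WideMemOpCost,
                                       long ReverseShuffleCost,
                                       int ConsecutiveStride) {
    bool IsReverse = ConsecutiveStride < 0;
    return WideMemOpCost + (IsReverse ? ReverseShuffleCost : 0);
  };
  (void)SketchConsecutiveMemOpCost;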
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  if (isa<LoadInst>(I)) {
         (IsLoopInvariantStoreValue

LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));

LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
  assert(Group && "Fail to get an interleaved access group.");
  auto *VectorTy = cast<VectorType>(toVectorTy(ValTy, VF));
  unsigned InterleaveFactor = Group->getFactor();

  // Holds the indices of existing members in the interleaved group.
  for (unsigned IF = 0; IF < InterleaveFactor; IF++)
    if (Group->getMember(IF))

  // Calculate the cost of the whole interleaved group.
  bool UseMaskForGaps =
      (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
      InsertPos->getOpcode(), WideVecTy, Group->getFactor(), Indices,

  if (Group->isReverse()) {
    // TODO: Add support for reversed masked interleaved access.
           "Reverse masked interleaved access not supported.");
    Cost += Group->getNumMembers() *

std::optional<InstructionCost>
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Ty))
  auto *VectorTy = cast<VectorType>(Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the original cost method
  // to be used.

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  if (!InLoopReductionImmediateChains.count(RetI))

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *LastChain = InLoopReductionImmediateChains.at(RetI);
  while (!isa<PHINode>(ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(ReductionPhi);

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
                         : dyn_cast<Instruction>(RetI->getOperand(1));

  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Op0);

        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;

    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
    bool IsUnsigned = isa<ZExtInst>(Op0);

    // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
    // different sizes. We take the largest type as the ext to reduce, and add
    // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
    if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
      Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
      return I == RetI ? RedCost : 0;

    // Matched reduce.add(mul())
    if (RedCost.isValid() && RedCost < MulCost + BaseCost)
      return I == RetI ? RedCost : 0;

  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
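// Illustrative sketch (not part of the pass): for a matched
// reduce.add(ext(mul(ext(A), ext(B)))) the decision above is a plain cost
// comparison between the fused reduction and the sum of its components
// (two input extends, the multiply, any extra extend, and the base reduction
// it replaces). Plain integer costs stand in for TTI results.
auto SketchUseFusedReduction = [](long FusedRedCost, long ExtCost,
                                  long MulCost, long Ext2Cost,
                                  long BaseRedCost) {
  return FusedRedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseRedCost;
};
(void)SketchUseFusedReduction;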
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
  // Calculate scalar cost only. Vectorization cost should be ready at this
  // point.

LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
  // There is no mechanism yet to create a scalable scalarization loop,
  // so this is currently Invalid.
  if (!RetTy->isVoidTy() &&

  // Some targets keep addresses scalar.

  // Some targets support efficient element stores.

  // Collect operands to consider.

  // Skip operands that do not require extraction/scalarization and do not
  // incur any overhead.
  for (auto *V : filterExtractingOperands(Ops, VF))
      filterExtractingOperands(Ops, VF), Tys, CostKind);

  // For each instruction in the old loop.
      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
        auto IsLegalToScalarize = [&]() {
          // Scalarization of fixed length vectors "just works".

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(I))

          // A uniform store isn't necessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(I);

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
            IsLegalToScalarize() ? getUniformMemOpCost(&I, VF)

        // Choose the better solution for the current VF. Note that Invalid
        // costs compare as maximally large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)

      // We assume that widening is the best solution when possible.
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      unsigned NumAccesses = 1;
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
          NumAccesses = Group->getNumMembers();

        InterleaveCost = getInterleaveGroupCost(&I, VF);
              ? getGatherScatterCost(&I, VF) * NumAccesses
              getMemInstScalarizationCost(&I, VF) * NumAccesses;

      // Choose the better solution for the current VF,
      // write down this decision and use it during vectorization.
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Cost = GatherScatterCost;
        Cost = ScalarizationCost;
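      // Illustrative sketch (not part of the pass): the widening decision
      // above is a three-way minimum with ties broken in favour of
      // interleaving. Costs are plain integers here; the enum and names are
      // hypothetical.
      enum class SketchMemDecision { Interleave, GatherScatter, Scalarize };
      auto SketchPickMemWidening = [](long InterleaveCost,
                                      long GatherScatterCost,
                                      long ScalarizationCost) {
        if (InterleaveCost <= GatherScatterCost &&
            InterleaveCost < ScalarizationCost)
          return SketchMemDecision::Interleave;
        if (GatherScatterCost < ScalarizationCost)
          return SketchMemDecision::GatherScatter;
        return SketchMemDecision::Scalarize;
      };
      (void)SketchPickMemWidening;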
      // If the instructions belong to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.

  // Start with all scalar pointer uses.

  // Add all instructions used to generate the addresses.
  while (!Worklist.empty()) {
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
            AddrDefs.insert(InstOp).second)

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.

      // Scalarize a widened load of address.

      // Scalarize an interleave group of address loads.
      for (unsigned I = 0; I < Group->getFactor(); ++I) {

    // Make sure I gets scalarized and a cost estimate without
    // scalarization overhead.
           "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(VF);
  // For each instruction in the old loop.
        for (auto &ArgOp : CI->args())

        // Estimate cost of scalarized vector call. The source operands are
        // assumed to be vectors, so we need to extract individual elements from
        // there, execute VF scalar calls, and then gather the result into the
        // vector return value.

        // Compute costs of unpacking argument values for the scalar calls and
        // packing the return values to a vector.

        // Honor ForcedScalars and UniformAfterVectorization decisions.
        // TODO: For calls, it might still be more profitable to widen. Use
        // VPlan-based cost model to compare different options.
        if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                               ForcedScalar->second.contains(CI)) ||

        // Compute corresponding vector type for return value and arguments.
        for (Type *ScalarTy : ScalarTys)

        // An in-loop reduction using an fmuladd intrinsic is a special case;
        // we don't want the normal cost for that intrinsic.
                                std::nullopt, *RedCost);

        // Find the cost of vectorizing the call, if we can find a suitable
        // vector variant of the function.
        bool UsesMask = false;

          // Search through any available variants for one we can use at this VF.
            // Must match requested VF.
            if (Info.Shape.VF != VF)

            // Must take a mask argument if one is required.
            if (MaskRequired && !Info.isMasked())

            // Check that all parameter kinds are supported.
              switch (Param.ParamKind) {
                // Make sure the scalar parameter in the loop is invariant.

                // Find the stride for the scalar parameter in this loop and see
                // if it matches the stride for the variant.
                // TODO: do we need to figure out the cost of an extract to get
                // the first lane? Or do we hope that it will be folded away?
                    dyn_cast<SCEVAddRecExpr>(SE->getSCEV(ScalarParam));
                if (!SAR || SAR->getLoop() != TheLoop) {
                    dyn_cast<SCEVConstant>(SAR->getStepRecurrence(*SE));

            // Found a suitable candidate, stop here.

          // Add in the cost of synthesizing a mask if one wasn't required.
          if (VecFunc && UsesMask && !MaskRequired)

        // Find the cost of an intrinsic; some targets may have instructions
        // that perform the operation without needing an actual call.
        if (VectorCost <= Cost) {
        if (IntrinsicCost <= Cost) {
          Cost = IntrinsicCost;
6376// Consider Op invariant, if it or its operands aren't predicated 6377// instruction in the loop. In that case, it is not trivially hoistable. 6378auto *OpI = dyn_cast<Instruction>(
Op);
6389// If we know that this instruction will remain uniform, check the cost of 6390// the scalar version. 6395return InstsToScalarize[VF][
I];
6397// Forced scalars do not have any scalarization overhead. 6398auto ForcedScalar = ForcedScalars.
find(VF);
6399if (VF.
isVector() && ForcedScalar != ForcedScalars.
end()) {
6400auto InstSet = ForcedScalar->second;
6401if (InstSet.count(
I))
6411auto HasSingleCopyAfterVectorization = [
this](
Instruction *
I,
6416auto Scalarized = InstsToScalarize.
find(VF);
6417assert(Scalarized != InstsToScalarize.
end() &&
6418"VF not yet analyzed for scalarization profitability");
6419return !Scalarized->second.count(
I) &&
6421 auto *UI = cast<Instruction>(U);
6422 return !Scalarized->second.count(UI);
6425 (void)HasSingleCopyAfterVectorization;
6429// With the exception of GEPs and PHIs, after scalarization there should 6430// only be one copy of the instruction generated in the loop. This is 6431// because the VF is either 1, or any instructions that need scalarizing 6432// have already been dealt with by the time we get here. As a result, 6433// it means we don't have to multiply the instruction cost by VF. 6434assert(
I->getOpcode() == Instruction::GetElementPtr ||
6435I->getOpcode() == Instruction::PHI ||
6436 (
I->getOpcode() == Instruction::BitCast &&
6437I->getType()->isPointerTy()) ||
6438 HasSingleCopyAfterVectorization(
I, VF));
6447// TODO: We need to estimate the cost of intrinsic calls. 6448switch (
I->getOpcode()) {
6449case Instruction::GetElementPtr:
6450// We mark this instruction as zero-cost because the cost of GEPs in 6451// vectorized code depends on whether the corresponding memory instruction 6452// is scalarized or not. Therefore, we handle GEPs with the memory 6455case Instruction::Br: {
6456// In cases of scalarized and predicated instructions, there will be VF 6457// predicated blocks in the vectorized loop. Each branch around these 6458// blocks requires also an extract of its vector compare i1 element. 6459// Note that the conditional branch from the loop latch will be replaced by 6460// a single branch controlling the loop, so there is no extra overhead from 6462bool ScalarPredicatedBB =
false;
6468 ScalarPredicatedBB =
true;
6470if (ScalarPredicatedBB) {
6471// Not possible to scalarize scalable vector with predicated instructions. 6474// Return cost for branches around scalarized and predicated blocks. 6480/*Insert*/false,
/*Extract*/true,
CostKind) +
6485// The back-edge branch will remain, as will all scalar branches. 6488// This branch will be eliminated by if-conversion. 6490// Note: We currently assume zero cost for an unconditional branch inside 6491// a predicated block since it will become a fall-through, although we 6492// may decide in the future to call TTI for all branches. 6494case Instruction::Switch: {
6497auto *Switch = cast<SwitchInst>(
I);
6498return Switch->getNumCases() *
6501toVectorTy(Switch->getCondition()->getType(), VF),
6505case Instruction::PHI: {
6506auto *Phi = cast<PHINode>(
I);
6508// First-order recurrences are replaced by vector shuffles inside the loop. 6510// For <vscale x 1 x i64>, if vscale = 1 we are unable to extract the 6511// penultimate value of the recurrence. 6512// TODO: Consider vscale_range info. 6518 cast<VectorType>(VectorTy), Mask,
CostKind,
6522// Phi nodes in non-header blocks (not inductions, reductions, etc.) are 6523// converted into select instructions. We require N - 1 selects per phi 6524// node, where N is the number of incoming values. 6526Type *ResultTy = Phi->getType();
6528// All instructions in an Any-of reduction chain are narrowed to bool. 6529// Check if that is the case for this phi node. 6530auto *HeaderUser = cast_if_present<PHINode>(
6531 find_singleton<User>(Phi->users(), [
this](
User *U,
bool) ->
User * {
6532 auto *Phi = dyn_cast<PHINode>(U);
6533 if (Phi && Phi->getParent() == TheLoop->getHeader())
6539auto Iter = ReductionVars.
find(HeaderUser);
6540if (Iter != ReductionVars.end() &&
6542 Iter->second.getRecurrenceKind()))
6545return (Phi->getNumIncomingValues() - 1) *
6547 Instruction::Select,
toVectorTy(ResultTy, VF),
6552// When tail folding with EVL, if the phi is part of an out of loop 6553// reduction then it will be transformed into a wide vp_merge. 6557 Intrinsic::vp_merge,
toVectorTy(Phi->getType(), VF),
6558 {toVectorTy(Type::getInt1Ty(Phi->getContext()), VF)});
6564case Instruction::UDiv:
6565case Instruction::SDiv:
6566case Instruction::URem:
6567case Instruction::SRem:
6571 ScalarCost : SafeDivisorCost;
6573// We've proven all lanes safe to speculate, fall through. 6575case Instruction::Add:
6576case Instruction::Sub: {
6580// Assume that a non-constant update value (or a constant != 1) requires 6581// a multiply, and add that into the cost. 6584if (!
RHS ||
RHS->getZExtValue() != 1)
6588// Find the cost of the histogram operation itself. 6590Type *ScalarTy =
I->getType();
6594 {PtrTy, ScalarTy, MaskTy});
6596// Add the costs together with the add/sub operation. 6602case Instruction::FAdd:
6603case Instruction::FSub:
6604case Instruction::Mul:
6605case Instruction::FMul:
6606case Instruction::FDiv:
6607case Instruction::FRem:
6608case Instruction::Shl:
6609case Instruction::LShr:
6610case Instruction::AShr:
6611case Instruction::And:
6612case Instruction::Or:
6613case Instruction::Xor: {
6614// If we're speculating on the stride being 1, the multiplication may 6615// fold away. We can generalize this for all operations using the notion 6616// of neutral elements. (TODO) 6617if (
I->getOpcode() == Instruction::Mul &&
6622// Detect reduction patterns 6626// Certain instructions can be cheaper to vectorize if they have a constant 6627// second vector operand. One example of this are shifts on x86. 6628Value *Op2 =
I->getOperand(1);
6631 Op2 = cast<SCEVConstant>(
PSE.
getSCEV(Op2))->getValue();
6641 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6644case Instruction::FNeg: {
6647 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6648 {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
6651case Instruction::Select: {
6653constSCEV *CondSCEV = SE->
getSCEV(SI->getCondition());
6656constValue *Op0, *Op1;
6660// select x, y, false --> x & y 6661// select x, true, y --> x | y 6673Type *CondTy = SI->getCondition()->getType();
6678if (
auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
6679 Pred = Cmp->getPredicate();
6681CostKind, {TTI::OK_AnyValue, TTI::OP_None},
6682 {TTI::OK_AnyValue, TTI::OP_None},
I);
6684case Instruction::ICmp:
6685case Instruction::FCmp: {
6686Type *ValTy =
I->getOperand(0)->getType();
6689Instruction *Op0AsInstruction = dyn_cast<Instruction>(
I->getOperand(0));
6690 (void)Op0AsInstruction;
6692 MinBWs[
I] == MinBWs[Op0AsInstruction]) &&
6693"if both the operand and the compare are marked for " 6694"truncation, they must have the same bitwidth");
6700 cast<CmpInst>(
I)->getPredicate(),
CostKind,
6701 {TTI::OK_AnyValue, TTI::OP_None},
6702 {TTI::OK_AnyValue, TTI::OP_None},
I);
6704case Instruction::Store:
6705case Instruction::Load: {
6710"CM decision should be taken at this point");
6717return getMemoryInstructionCost(
I, VF);
6719case Instruction::BitCast:
6720if (
I->getType()->isPointerTy())
6723case Instruction::ZExt:
6724case Instruction::SExt:
6725case Instruction::FPToUI:
6726case Instruction::FPToSI:
6727case Instruction::FPExt:
6728case Instruction::PtrToInt:
6729case Instruction::IntToPtr:
6730case Instruction::SIToFP:
6731case Instruction::UIToFP:
6732case Instruction::Trunc:
6733case Instruction::FPTrunc: {
6734// Computes the CastContextHint from a Load/Store instruction. 6736assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
6737"Expected a load or a store!");
6763unsigned Opcode =
I->getOpcode();
6765// For Trunc, the context is the only user, which must be a StoreInst. 6766if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
6768if (
StoreInst *Store = dyn_cast<StoreInst>(*
I->user_begin()))
6769 CCH = ComputeCCH(Store);
6771// For Z/Sext, the context is the operand, which must be a LoadInst. 6772elseif (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
6773 Opcode == Instruction::FPExt) {
6774if (
LoadInst *Load = dyn_cast<LoadInst>(
I->getOperand(0)))
6775 CCH = ComputeCCH(Load);
6778// We optimize the truncation of induction variables having constant 6779// integer steps. The cost of these truncations is the same as the scalar 6782auto *Trunc = cast<TruncInst>(
I);
6784 Trunc->getSrcTy(), CCH,
CostKind, Trunc);
6787// Detect reduction patterns 6791Type *SrcScalarTy =
I->getOperand(0)->getType();
6792Instruction *Op0AsInstruction = dyn_cast<Instruction>(
I->getOperand(0));
6800// If the result type is <= the source type, there will be no extend 6801// after truncating the users to the minimal required bitwidth. 6803 (
I->getOpcode() == Instruction::ZExt ||
6804I->getOpcode() == Instruction::SExt))
6810case Instruction::Call:
6812case Instruction::ExtractValue:
6814case Instruction::Alloca:
6815// We cannot easily widen alloca to a scalable alloca, as 6816// the result would need to be a vector of pointers. 6821// This opcode is unknown. Assume that it is the same as 'mul'. 6827// Ignore ephemeral values. 6833// If a scalar epilogue is required, users outside the loop won't use 6834// live-outs from the vector loop but from the scalar epilogue. Ignore them if 6837auto IsLiveOutDead = [
this, RequiresScalarEpilogue](
User *U) {
6838return RequiresScalarEpilogue &&
6847// Find all stores to invariant variables. Since they are going to sink 6848// outside the loop we do not need calculate cost for them. 6850if ((SI = dyn_cast<StoreInst>(&
I)) &&
6853 DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
6854 SI->getValueOperand());
6860// Add instructions that would be trivially dead and are only used by 6861// values already ignored to DeadOps to seed worklist. 6864 return VecValuesToIgnore.contains(U) ||
6865 ValuesToIgnore.contains(U) || IsLiveOutDead(U);
6869// For interleave groups, we only create a pointer for the start of the 6870// interleave group. Queue up addresses of group members except the insert 6871// position for further processing. 6874if (Group->getInsertPos() == &
I)
6877 DeadInterleavePointerOps.
push_back(PointerOp);
6880// Queue branches for analysis. They are dead, if their successors only 6881// contain dead instructions. 6882if (
auto *Br = dyn_cast<BranchInst>(&
I)) {
6883if (Br->isConditional())
6888// Mark ops feeding interleave group members as free, if they are only used 6889// by other dead computations. 6890for (
unsignedI = 0;
I != DeadInterleavePointerOps.
size(); ++
I) {
6891auto *
Op = dyn_cast<Instruction>(DeadInterleavePointerOps[
I]);
6893 Instruction *UI = cast<Instruction>(U);
6894 return !VecValuesToIgnore.contains(U) &&
6895 (!isAccessInterleaved(UI) ||
6896 getInterleavedAccessGroup(UI)->getInsertPos() == UI);
6900 DeadInterleavePointerOps.
append(
Op->op_begin(),
Op->op_end());
6903for (
constauto &[
_, Ops] : DeadInvariantStoreOps) {
6907// Mark ops that would be trivially dead and are only used by ignored 6908// instructions as free. 6911// Returns true if the block contains only dead instructions. Such blocks will 6912// be removed by VPlan-to-VPlan transforms and won't be considered by the 6913// VPlan-based cost model, so skip them in the legacy cost-model as well. 6917 (isa<BranchInst>(&
I) && !cast<BranchInst>(&
I)->isConditional());
6920for (
unsignedI = 0;
I != DeadOps.
size(); ++
I) {
6921auto *
Op = dyn_cast<Instruction>(DeadOps[
I]);
6923// Check if the branch should be considered dead. 6924if (
auto *Br = dyn_cast_or_null<BranchInst>(
Op)) {
6927// Don't considers branches leaving the loop for simplification. 6932if ((ThenEmpty && ElseEmpty) ||
6934 ElseBB->
phis().empty()) ||
6936 ThenBB->
phis().empty())) {
6943// Skip any op that shouldn't be considered dead. 6945 (isa<PHINode>(
Op) &&
Op->getParent() == Header) ||
6948 return !VecValuesToIgnore.contains(U) &&
6949 !ValuesToIgnore.contains(U) && !IsLiveOutDead(U);
6956// If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore 6957// which applies for both scalar and vector versions. Otherwise it is only 6958// dead in vector versions, so only add it to VecValuesToIgnore. 6960 [
this](
User *U) { return ValuesToIgnore.contains(U); }))
6967// Ignore type-promoting instructions we identified during reduction 6974// Ignore type-casting instructions we identified during induction 6988// We don't collect reductions that are type promoted (yet). 6992// If the target would prefer this reduction to happen "in-loop", then we 6993// want to record it as such. 7000// Check that we can correctly put the reductions into the loop, by 7001// finding the chain of operations that leads from the phi to the loop 7005bool InLoop = !ReductionOperations.
empty();
7008 InLoopReductions.
insert(Phi);
7009// Add the elements to InLoopReductionImmediateChains for cost modelling. 7011for (
auto *
I : ReductionOperations) {
7012 InLoopReductionImmediateChains[
I] = LastChain;
7016LLVM_DEBUG(
dbgs() <<
"LV: Using " << (InLoop ?
"inloop" :
"out of loop")
7017 <<
" reduction for phi: " << *Phi <<
"\n");
7021// This function will select a scalable VF if the target supports scalable 7022// vectors and a fixed one otherwise. 7023// TODO: we could return a pair of values that specify the max VF and 7024// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of 7025// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment 7026// doesn't have a cost model that can choose which plan to execute if 7027// more than one is generated. 7039unsignedN =
RegSize.getKnownMinValue() / WidestType;
7046// Outer loop handling: They may require CFG and instruction level 7047// transformations before even evaluating whether vectorization is profitable. 7048// Since we cannot modify the incoming IR, we need to build VPlan upfront in 7049// the vectorization pipeline. 7051// If the user doesn't provide a vectorization factor, determine a 7057// Make sure we have a VF > 1 for stress testing. 7060 <<
"overriding computed VF.\n");
7065LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing. Scalable VF requested, but " 7066 <<
"not supported by the target.\n");
7068"Scalable vectorization requested but not supported by the target",
7069"the scalable user-specified vectorization width for outer-loop " 7070"vectorization cannot be used because the target does not support " 7072"ScalableVFUnfeasible", ORE, OrigLoop);
7077"VF needs to be a power of two");
7079 <<
"VF " << VF <<
" to build VPlans.\n");
7082// For VPlan build stress testing, we bail out after VPlan construction. 7086return {VF, 0
/*Cost*/, 0
/* ScalarCost */};
7090dbgs() <<
"LV: Not vectorizing. Inner loops aren't supported in the " 7091"VPlan-native path.\n");
7101if (!MaxFactors)
// Cases that should not to be vectorized nor interleaved. 7104// Invalidate interleave groups if all blocks of loop will be predicated. 7109 <<
"LV: Invalidate all interleaved groups due to fold-tail by masking " 7110"which requires masked-interleaved support.\n");
7112// Invalidating interleave groups also requires invalidating all decisions 7113// based on them, which includes widening decisions and uniform and scalar 7126"UserVF ignored because it may be larger than the maximal safe VF",
7127"InvalidUserVF", ORE, OrigLoop);
7130"VF needs to be a power of two");
7131// Collect the instructions (and their associated costs) that will be more 7132// profitable to scalarize. 7136 buildVPlansWithVPRecipes(UserVF, UserVF);
7141"InvalidCost", ORE, OrigLoop);
7145// Collect the Vectorization Factor Candidates. 7155for (
constauto &VF : VFCandidates) {
7156// Collect Uniform and Scalar instructions after vectorization with VF. 7159// Collect the instructions (and their associated costs) that will be more 7160// profitable to scalarize. 7188// Cost modeling for inductions is inaccurate in the legacy cost model 7189// compared to the recipes that are generated. To match here initially during 7190// VPlan cost model bring up directly use the induction costs from the legacy 7191// cost model. Note that we do this as pre-processing; the VPlan may not have 7192// any recipes associated with the original induction increment instruction 7193// and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute 7194// the cost of induction phis and increments (both that are represented by 7195// recipes and those that are not), to avoid distinguishing between them here, 7196// and skip all recipes that represent induction phis and increments (the 7197// former case) later on, if they exist, to avoid counting them twice. 7198// Similarly we pre-compute the cost of any optimized truncates. 7199// TODO: Switch to more accurate costing based on VPlan. 7204for (
unsignedI = 0;
I != IVInsts.
size();
I++) {
7205for (
Value *
Op : IVInsts[
I]->operands()) {
7206auto *OpI = dyn_cast<Instruction>(
Op);
7207if (
Op ==
IV || !OpI || !OrigLoop->
contains(OpI) || !
Op->hasOneUse())
7213for (
User *U :
IV->users()) {
7214auto *CI = cast<Instruction>(U);
7220// If the vector loop gets executed exactly once with the given VF, ignore 7221// the costs of comparison and induction instructions, as they'll get 7223// TODO: Remove this code after stepping away from the legacy cost model and 7224// adding code to simplify VPlans before calculating their costs. 7235dbgs() <<
"Cost of " << InductionCost <<
" for VF " << VF
7236 <<
": induction instruction " << *IVInst <<
"\n";
7238Cost += InductionCost;
7243 /// Compute the cost of all exiting conditions of the loop using the legacy 7244 /// cost model. This is to match the legacy behavior, which adds the cost of 7245 /// all exit conditions. Note that this over-estimates the cost, as there will 7246 /// be a single condition to control the vector loop. 7250// Collect all exit conditions. 7252auto *
Term = dyn_cast<BranchInst>(EB->getTerminator());
7255if (
auto *CondI = dyn_cast<Instruction>(
Term->getOperand(0))) {
7256 ExitInstrs.
insert(CondI);
7259// Compute the cost of all instructions only feeding the exit conditions. 7260for (
unsignedI = 0;
I != ExitInstrs.
size(); ++
I) {
7267dbgs() <<
"Cost of " << CondICost <<
" for VF " << VF
7268 <<
": exit condition instruction " << *CondI <<
"\n";
7272auto *OpI = dyn_cast<Instruction>(
Op);
7273if (!OpI ||
any_of(OpI->users(), [&ExitInstrs,
this](
User *U) {
7274 return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
7275 !ExitInstrs.contains(cast<Instruction>(U));
7282// The legacy cost model has special logic to compute the cost of in-loop 7283// reductions, which may be smaller than the sum of all instructions involved 7285// TODO: Switch to costing based on VPlan once the logic has been ported. 7293constauto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
auto IsZExtOrSExt = [](const unsigned Opcode) -> bool {
  return Opcode == Instruction::ZExt || Opcode == Instruction::SExt;
};
// Also include the operands of instructions in the chain, as the cost-model
// may mark extends as free.
//
// For ARM, some of the instructions can be folded into the reduction
// instruction. So we need to mark all folded instructions as free.
// For example: we can fold reduce(mul(ext(A), ext(B))) into one instruction.
for (auto *ChainOp : ChainOps) {
  for (Value *Op : ChainOp->operands()) {
    if (auto *I = dyn_cast<Instruction>(Op)) {
      ChainOpsAndOperands.insert(I);
      if (I->getOpcode() == Instruction::Mul) {
        auto *Ext0 = dyn_cast<Instruction>(I->getOperand(0));
        auto *Ext1 = dyn_cast<Instruction>(I->getOperand(1));
        if (Ext0 && IsZExtOrSExt(Ext0->getOpcode()) && Ext1 &&
            Ext0->getOpcode() == Ext1->getOpcode()) {
          ChainOpsAndOperands.insert(Ext0);
          ChainOpsAndOperands.insert(Ext1);

// Pre-compute the cost for I, if it has a reduction pattern cost.
"reduction op visited multiple times");
LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                  << ":\n in-loop reduction " << *I << "\n");
Cost += *ReductionCost;

// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
// TODO: Compute cost of branches for each replicate region in the VPlan,
// which is more accurate than the legacy cost model.
7339// Pre-compute the costs for branches except for the backedge, as the number 7340// of replicate regions in a VPlan may not directly match the number of 7341// branches, which would lead to different decisions. 7342// TODO: Compute cost of branches for each replicate region in the VPlan, 7343// which is more accurate than the legacy cost model. 7350auto BranchCost = CostCtx.
getLegacyCost(BB->getTerminator(), VF);
7354// Pre-compute costs for instructions that are forced-scalar or profitable to 7355// scalarize. Their costs will be computed separately in the legacy cost 7357for (
Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
7363dbgs() <<
"Cost of " << ForcedCost <<
" for VF " << VF
7364 <<
": forced scalar " << *ForcedScalar <<
"\n";
for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
  dbgs() << "Cost of " << ScalarCost << " for VF " << VF
         << ": profitable to scalarize " << *Scalarized << "\n";

// Now compute and add the VPlan-based cost.
<< " (Estimated cost per lane: ");
double CostPerLane = double(*Cost.getValue()) / EstimatedWidth;
}
else /* No point dividing an invalid cost - it will still be invalid */

/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.

// First collect all instructions for the recipes in Plan.
if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
  return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
  return &WidenMem->getIngredient();
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
  if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
    auto *IG = IR->getInterleaveGroup();
    unsigned NumMembers = IG->getNumMembers();
    for (unsigned I = 0; I != NumMembers; ++I) {
// The VPlan-based cost model is more accurate for partial reductions and
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
7443// Return true if the loop contains any instructions that are not also part of 7444// the VPlan or are skipped for VPlan-based cost computations. This indicates 7445// that the VPlan contains extra simplifications. 7448 return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
7449 if (isa<PHINode>(&I) && BB == TheLoop->getHeader())
7451 return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
7460// If there is a single VPlan with a single VF, return it directly. 7461VPlan &FirstPlan = *VPlans[0];
7467 ?
"Reciprocal Throughput\n" 7469 ?
"Instruction Latency\n" 7472 ?
"Code Size and Latency\n" 7477"More than a single plan/VF w/o any plan having scalar VF");
7479// TODO: Compute scalar cost using VPlan-based cost model. 7481LLVM_DEBUG(
dbgs() <<
"LV: Scalar loop costs: " << ScalarCost <<
".\n");
7486if (ForceVectorization) {
// Ignore scalar width, because the user explicitly wants vectorization.
// Initialize cost to max so that VF = 2 is, at least, chosen during cost
// comparison.
for (auto &P : VPlans) {
  << "LV: Not considering vector loop of width " << VF
  << " because it will not generate any vector instructions.\n");

  if (isMoreProfitable(CurrentFactor, BestFactor))
    BestFactor = CurrentFactor;

  // If profitable, add it to the ProfitableVFs list.
  if (isMoreProfitable(CurrentFactor, ScalarFactor))
    ProfitableVFs.push_back(CurrentFactor);
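// A simplified sketch of how a "more profitable" comparison between
// (cost, width) candidates can be made without dividing, by cross-multiplying
// cost and width. It deliberately ignores the real isMoreProfitable's extra
// handling (scalable VFs, tail folding, tie breaks) and only illustrates the
// selection loop above; all names here are illustrative.
#include <cstdint>
#include <vector>

struct CandidateFactor {
  uint64_t Cost;  // estimated cost of one vector iteration
  unsigned Width; // number of scalar iterations covered (VF)
};

static bool isCheaperPerLane(const CandidateFactor &A,
                             const CandidateFactor &B) {
  // A beats B if Cost(A)/Width(A) < Cost(B)/Width(B), compared without
  // division to avoid rounding.
  return A.Cost * B.Width < B.Cost * A.Width;
}

static CandidateFactor pickBest(const std::vector<CandidateFactor> &Cands,
                                CandidateFactor Scalar) {
  CandidateFactor Best = Scalar;
  for (const CandidateFactor &C : Cands)
    if (isCheaperPerLane(C, Best))
      Best = C;
  return Best;
}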
7517// Select the optimal vectorization factor according to the legacy cost-model. 7518// This is now only used to verify the decisions by the new VPlan-based 7519// cost-model and will be retired once the VPlan-based cost-model is 7524// Pre-compute the cost and use it to check if BestPlan contains any 7525// simplifications not accounted for in the legacy cost model. If that's the 7526// case, don't trigger the assertion, as the extra simplifications may cause a 7527// different VF to be picked by the VPlan-based cost model. 7530 precomputeCosts(BestPlan, BestFactor.
Width, CostCtx);
7533 CostCtx, OrigLoop) ||
7535 CostCtx, OrigLoop)) &&
7536" VPlan cost model and legacy cost model disagreed");
7538"when vectorizing, the scalar cost must be computed.");
7547// Reserve first location for self reference to the LoopID metadata node. 7549bool IsUnrollMetadata =
false;
7550MDNode *LoopID = L->getLoopID();
7552// First find existing loop unrolling disable metadata. 7554auto *MD = dyn_cast<MDNode>(LoopID->
getOperand(
I));
7556constauto *S = dyn_cast<MDString>(MD->getOperand(0));
7558 S && S->getString().starts_with(
"llvm.loop.unroll.disable");
7564if (!IsUnrollMetadata) {
7565// Add runtime unroll disable metadata. 7566LLVMContext &Context = L->getHeader()->getContext();
7573// Set operand 0 to refer to the loop id itself. 7575 L->setLoopID(NewLoopID);
7579// If \p R is a ComputeReductionResult when vectorizing the epilog loop, 7580// fix the reduction's scalar PHI node by adding the incoming value from the 7585auto *EpiRedResult = dyn_cast<VPInstruction>(R);
7590auto *EpiRedHeaderPhi =
7591 cast<VPReductionPHIRecipe>(EpiRedResult->getOperand(0));
7593 EpiRedHeaderPhi->getRecurrenceDescriptor();
7594Value *MainResumeValue =
7595 EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
7598auto *Cmp = cast<ICmpInst>(MainResumeValue);
7600"AnyOf expected to start with ICMP_NE");
7602"AnyOf expected to start by comparing main resume value to original " 7604 MainResumeValue = Cmp->getOperand(0);
7608Value *Cmp, *OrigResumeV;
7609bool IsExpectedPattern =
7616assert(IsExpectedPattern &&
"Unexpected reduction resume pattern");
7617 (void)IsExpectedPattern;
7618 MainResumeValue = OrigResumeV;
7620PHINode *MainResumePhi = cast<PHINode>(MainResumeValue);
7622// When fixing reductions in the epilogue loop we should already have 7623// created a bc.merge.rdx Phi after the main vector body. Ensure that we carry 7624// over the incoming values correctly. 7625using namespaceVPlanPatternMatch;
7626auto IsResumePhi = [](
VPUser *U) {
7628 U, m_VPInstruction<VPInstruction::ResumePhi>(m_VPValue(), m_VPValue()));
7631"ResumePhi must have a single user");
7632auto *EpiResumePhiVPI =
7633 cast<VPInstruction>(*
find_if(EpiRedResult->users(), IsResumePhi));
7634auto *EpiResumePhi = cast<PHINode>(State.
get(EpiResumePhiVPI,
true));
7635 EpiResumePhi->setIncomingValueForBlock(
7644"Trying to execute plan with unsupported VF");
7646"Trying to execute plan with unsupported UF");
7648 ((VectorizingEpilogue && ExpandedSCEVs) ||
7649 (!VectorizingEpilogue && !ExpandedSCEVs)) &&
7650"expanded SCEVs to reuse can only be used during epilogue vectorization");
7652// TODO: Move to VPlan transform stage once the transition to the VPlan-based 7653// cost model is complete for better cost estimates. 7659// Perform the actual loop transformation. 7664#ifdef EXPENSIVE_CHECKS 7665assert(DT->
verify(DominatorTree::VerificationLevel::Fast));
7668// 0. Generate SCEV-dependent code in the entry, including TripCount, before 7669// making any changes to the CFG. 7676assert(VectorizingEpilogue &&
"should only re-use the existing trip " 7677"count during epilogue vectorization");
7679// 1. Set up the skeleton for vectorization, including vector pre-header and 7680// middle block. The vector loop is created during VPlan execution. 7685if (VectorizingEpilogue)
7688// Only use noalias metadata when using memory checks guaranteeing no overlap 7689// across all iterations. 7691 std::unique_ptr<LoopVersioning> LVer =
nullptr;
7695// We currently don't use LoopVersioning for the actual loop cloning but we 7696// still use it to add the noalias metadata. 7697// TODO: Find a better way to re-use LoopVersioning functionality to add 7699 LVer = std::make_unique<LoopVersioning>(
7702 State.
LVer = &*LVer;
7708//===------------------------------------------------===// 7710// Notice: any optimization or new instruction that go 7711// into the code below should also be implemented in 7714//===------------------------------------------------===// 7716// 2. Copy and widen instructions from the old loop into the new loop. 7725// 2.5 When vectorizing the epilogue, fix reduction and induction resume 7726// values from the additional bypass block. 7727if (VectorizingEpilogue) {
7729"Epilogue vectorisation not yet supported with early exits");
7733 &R, State, State.
CFG.
VPBB2IRBB[MiddleVPBB], BypassBlock);
7737auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7739 Inc->setIncomingValueForBlock(BypassBlock, V);
7743// 2.6. Maintain Loop Hints 7744// Keep all loop hints from the original loop on the vector loop (we'll 7745// replace the vectorizer-specific hints below). 7749 std::optional<MDNode *> VectorizedLoopID =
7755if (VectorizedLoopID) {
7756 L->setLoopID(*VectorizedLoopID);
7758// Keep all loop hints from the original loop on the vector loop (we'll 7759// replace the vectorizer-specific hints below). 7772// 3. Fix the vectorized code: take care of header phi's, live-outs, 7773// predication, updating analyses. 7778// 4. Adjust branch weight of the branch in the middle block. 7782 cast<BranchInst>(State.
CFG.
VPBB2IRBB[MiddleVPBB]->getTerminator());
if (MiddleTerm->isConditional() &&
// Assume that `Count % VectorTripCount` is equally distributed.
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
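// Worked example for the weight computation above (illustrative only): with
// a vector step of VF * UF = 8 and `Count % 8` assumed uniform over [0, 8),
// exactly one of the eight possible remainder values selects one side of the
// middle-block branch, giving branch weights {1, 7}.
#include <array>
#include <cstdint>

static std::array<uint32_t, 2> middleBlockWeights(unsigned VF, unsigned UF) {
  unsigned Step = VF * UF; // e.g. VF = 4, UF = 2 -> Step = 8
  return {1, Step - 1};    // -> {1, 7}
}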
7796//===--------------------------------------------------------------------===// 7797// EpilogueVectorizerMainLoop 7798//===--------------------------------------------------------------------===// 7800/// This function is partially responsible for generating the control flow 7801/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7803const SCEV2ValueTy &ExpandedSCEVs) {
7806// Generate the code to check the minimum iteration count of the vector 7807// epilogue (see below). 7812// Generate the code to check any assumptions that we've made for SCEV 7816// Generate the code that checks at runtime if arrays overlap. We put the 7817// checks into a separate block to make the more common case of few elements 7821// Generate the iteration count check for the main loop, *after* the check 7822// for the epilogue loop, so that the path-length is shorter for the case 7823// that goes directly through the vector epilogue. The longer-path length for 7824// the main loop is compensated for, by the gain from vectorizing the larger 7825// trip count. Note: the branch will get updated later on when we vectorize 7830// Generate the induction variable. 7838dbgs() <<
"Create Skeleton for epilogue vectorized loop (first pass)\n" 7848dbgs() <<
"intermediate fn:\n" 7856assert(Bypass &&
"Expected valid bypass basic block.");
7860// Reuse existing vector loop preheader for TC checks. 7861// Note that new preheader block is generated for vector loop. 7865// Generate code to check if the loop's trip count is less than VF * UF of the 7877 TCCheckBlock->
setName(
"vector.main.loop.iter.check");
7879// Create new preheader for vector loop. 7881DT,
LI,
nullptr,
"vector.ph");
7886"TC check is expected to dominate Bypass");
7890// Save the trip count so we don't have to regenerate it in the 7891// vec.epilog.iter.check. This is safe to do because the trip count 7892// generated here dominates the vector epilog iter check. 7906//===--------------------------------------------------------------------===// 7907// EpilogueVectorizerEpilogueLoop 7908//===--------------------------------------------------------------------===// 7910/// This function is partially responsible for generating the control flow 7911/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 7914const SCEV2ValueTy &ExpandedSCEVs) {
7917// Now, compare the remaining count and if there aren't enough iterations to 7918// execute the vectorized epilogue skip to the scalar part. 7922nullptr,
"vec.epilog.iter.check",
true);
7924 VecEpilogueIterationCountCheck);
7927// Adjust the control flow taking the state info from the main loop 7928// vectorization into account. 7930"expected this to be saved from the previous pass.");
7946// Keep track of bypass blocks, as they feed start values to the induction and 7947// reduction phis in the scalar loop preheader. 7954// The vec.epilog.iter.check block may contain Phi nodes from inductions or 7955// reductions which merge control-flow from the latch block and the middle 7956// block. Update the incoming values here and move the Phi into the preheader. 7958for (
PHINode &Phi : VecEpilogueIterationCountCheck->
phis())
7961for (
PHINode *Phi : PhisInBlock) {
7963 Phi->replaceIncomingBlockWith(
7965 VecEpilogueIterationCountCheck);
7967// If the phi doesn't have an incoming value from the 7968// EpilogueIterationCountCheck, we are done. Otherwise remove the incoming 7969// value and also those from other check blocks. This is needed for 7970// reduction phis only. 7972 return EPI.EpilogueIterationCountCheck == IncB;
7982// Generate bypass values from the additional bypass block. Note that when the 7983// vectorized epilogue is skipped due to iteration count check, then the 7984// resume value for the induction variable comes from the trip count of the 7985// main vector loop, passed as the second argument. 7995"Expected trip count to have been saved in the first pass.");
7999"saved trip count does not dominate insertion point.");
8004// Generate code to check if the loop's trip count is less than VF * UF of the 8005// vector epilogue loop. 8010Value *CheckMinIters =
8014"min.epilog.iters.check");
8020unsigned EpilogueLoopStep =
8022// We assume the remaining `Count` is equally distributed in 8024// So the probability for `Count < EpilogueLoopStep` should be 8025// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep 8026unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
8027constuint32_t Weights[] = {EstimatedSkipCount,
8028 MainLoopStep - EstimatedSkipCount};
8034// A new entry block has been created for the epilogue VPlan. Hook it in, as 8035// otherwise we would try to modify the entry to the main vector loop. 8040// OldEntry is now dead and will be cleaned up when the plan gets destroyed. 8048dbgs() <<
"Create Skeleton for epilogue vectorized loop (second pass)\n" 8063return getVPValueOrAddLiveIn(
Op);
8075"unsupported switch either exiting loop or continuing to header");
8076// Create masks where the terminator in Src is a switch. We create mask for 8077// all edges at the same time. This is more efficient, as we can create and 8078// collect compares for all cases once. 8080BasicBlock *DefaultDst = SI->getDefaultDest();
for (auto &C : SI->cases()) {
  assert(!EdgeMaskCache.contains({Src, Dst}) && "Edge masks already created");
  // Cases whose destination is the same as default are redundant and can be
  // ignored - they will get there anyhow.
  if (Dst == DefaultDst)
  auto &Compares = Dst2Compares[Dst];

// We need to handle 2 separate cases below for all entries in Dst2Compares,
// which excludes destinations matching the default destination.
for (const auto &[Dst, Conds] : Dst2Compares) {
  // 1. Dst is not the default destination. Dst is reached if any of the cases
  // with destination == Dst are taken. Join the conditions for each case
  // whose destination == Dst using an OR.
  EdgeMaskCache[{Src, Dst}] = Mask;

  // 2. Create the mask for the default destination, which is reached if none
  // of the cases with destination != default destination are taken. Join the
  // conditions for each case where the destination is != Dst using an OR and
  // negate it.
  DefaultMask = DefaultMask ? Builder.createOr(DefaultMask, Mask) : Mask;

DefaultMask = Builder.createNot(DefaultMask);
EdgeMaskCache[{Src, DefaultDst}] = DefaultMask;
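// Standalone sketch of the mask construction above for a hypothetical switch
// with cases 1 and 3 going to block B and everything else to the default
// block D: mask(Src->B) = (x==1) | (x==3) and mask(Src->D) = !mask(Src->B).
// Per-lane booleans stand in for VPValues; names are illustrative.
#include <array>
#include <cstddef>
#include <cstdint>

template <std::size_t N> struct SwitchMasks {
  std::array<bool, N> ToCaseDst; // mask for edge Src -> B
  std::array<bool, N> ToDefault; // mask for edge Src -> D
};

template <std::size_t N>
SwitchMasks<N> buildSwitchMasks(const std::array<int32_t, N> &X) {
  SwitchMasks<N> M{};
  for (std::size_t Lane = 0; Lane < N; ++Lane) {
    // OR of the compares for all cases targeting the same destination.
    M.ToCaseDst[Lane] = (X[Lane] == 1) || (X[Lane] == 3);
    // Default is taken iff no non-default case is taken.
    M.ToDefault[Lane] = !M.ToCaseDst[Lane];
  }
  return M;
}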
8127// Look for cached value. 8128 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8130if (ECEntryIt != EdgeMaskCache.
end())
8131return ECEntryIt->second;
8133if (
auto *SI = dyn_cast<SwitchInst>(Src->getTerminator())) {
8135assert(EdgeMaskCache.
contains(Edge) &&
"Mask for Edge not created?");
8136return EdgeMaskCache[Edge];
8141// The terminator has to be a branch inst! 8142BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
8143assert(BI &&
"Unexpected terminator found");
8145return EdgeMaskCache[Edge] = SrcMask;
8147// If source is an exiting block, we know the exit edge is dynamically dead 8148// in the vector loop, and thus we don't need to restrict the mask. Avoid 8149// adding uses of an otherwise potentially dead instruction unless we are 8150// vectorizing a loop with uncountable exits. In that case, we always 8151// materialize the mask. 8154return EdgeMaskCache[Edge] = SrcMask;
8157assert(EdgeMask &&
"No Edge Mask found for condition");
8162if (SrcMask) {
// Otherwise block in-mask is all-one, no need to AND. 8163// The bitwise 'And' of SrcMask and EdgeMask introduces new UB if SrcMask 8164// is false and EdgeMask is poison. Avoid that by using 'LogicalAnd' 8165// instead which generates 'select i1 SrcMask, i1 EdgeMask, i1 false'. 8169return EdgeMaskCache[Edge] = EdgeMask;
8175// Look for cached value. 8176 std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
8178assert(ECEntryIt != EdgeMaskCache.
end() &&
8179"looking up mask for edge which has not been created");
8180return ECEntryIt->second;
8186// When not folding the tail, use nullptr to model all-true mask. 8188 BlockMaskCache[Header] =
nullptr;
8192// Introduce the early-exit compare IV <= BTC to form header block mask. 8193// This is used instead of IV < TC because TC may wrap, unlike BTC. Start by 8194// constructing the desired canonical IV in the header block as its first 8195// non-phi instructions. 8200 HeaderVPBB->
insert(
IV, NewInsertionPoint);
8207 BlockMaskCache[Header] = BlockMask;
8211// Return the cached value. 8213assert(BCEntryIt != BlockMaskCache.
end() &&
8214"Trying to access mask for block without one.");
8215return BCEntryIt->second;
8220assert(BlockMaskCache.
count(BB) == 0 &&
"Mask for block already computed");
8222"Loop header must have cached block mask");
8224// All-one mask is modelled as no-mask following the convention for masked 8225// load/store/gather/scatter. Initialize BlockMask to no-mask. 8227// This is the block mask. We OR all unique incoming edges. 8228for (
auto *Predecessor :
  if (!EdgeMask) { // Mask of predecessor is all-one so mask of block is too.
    BlockMaskCache[BB] = EdgeMask;

  if (!BlockMask) { // BlockMask has its initialized nullptr value.
    BlockMask = EdgeMask;

  BlockMask = Builder.createOr(BlockMask, EdgeMask, {});

BlockMaskCache[BB] = BlockMask;
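// Minimal sketch of the OR-of-incoming-edges logic above, using
// std::optional to model the "no mask means all-true" convention (nullptr in
// the real code). Names are illustrative, not the vectorizer's; masks are
// assumed to have equal lane counts.
#include <cstddef>
#include <optional>
#include <vector>

using LaneMask = std::vector<bool>;

static std::optional<LaneMask>
blockMaskFromEdges(const std::vector<std::optional<LaneMask>> &EdgeMasks) {
  std::optional<LaneMask> BlockMask; // starts out meaning "all-true"
  for (const auto &EdgeMask : EdgeMasks) {
    if (!EdgeMask)
      return std::nullopt; // an all-true incoming edge makes the block all-true
    if (!BlockMask) {
      BlockMask = EdgeMask; // first real mask seen
      continue;
    }
    for (std::size_t I = 0; I < BlockMask->size(); ++I)
      (*BlockMask)[I] = (*BlockMask)[I] || (*EdgeMask)[I];
  }
  return BlockMask;
}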
8250assert((isa<LoadInst>(
I) || isa<StoreInst>(
I)) &&
8251"Must be called with either a load or store");
8257"CM decision should be taken at this point.");
8273// Determine if the pointer operand of the access is either consecutive or 8274// reverse consecutive. 8283auto *
GEP = dyn_cast<GetElementPtrInst>(
8284Ptr->getUnderlyingValue()->stripPointerCasts());
8287// When folding the tail, we may compute an address that we don't in the 8288// original scalar loop and it may not be inbounds. Drop Inbounds in that 8298GEP ?
GEP->getNoWrapFlags()
8305if (
LoadInst *Load = dyn_cast<LoadInst>(
I))
8314/// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also 8315/// insert a recipe to expand the step for the induction recipe. 8323"step must be loop invariant");
8327if (
auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8330 TruncI->getDebugLoc());
8332assert(isa<PHINode>(PhiOrTrunc) &&
"must be a phi node here");
8334 IndDesc, Phi->getDebugLoc());
8340// Check if this is an integer or fp induction. If so, build the recipe that 8341// produces its scalar and vector values. 8344 *PSE.
getSE(), *OrigLoop);
8346// Check if this is pointer induction. If so, build the recipe for it. 8364// Optimize the special case where the source is a constant integer 8365// induction variable. Notice that we can only optimize the 'trunc' case 8366// because (a) FP conversions lose precision, (b) sext/zext may wrap, and 8367// (c) other casts depend on pointer size. 8369// Determine whether \p K is a truncation based on an induction variable that 8371auto IsOptimizableIVTruncate =
8379 IsOptimizableIVTruncate(
I),
Range)) {
8381auto *
Phi = cast<PHINode>(
I->getOperand(0));
8392unsigned NumIncoming =
Phi->getNumIncomingValues();
8394// We know that all PHIs in non-header blocks are converted into selects, so 8395// we don't have to worry about the insertion order and we can just use the 8396// builder. At this point we generate the predication tree. There may be 8397// duplications since this is a simple recursive scan, but future 8398// optimizations will clean it up. 8401for (
unsigned In = 0;
In < NumIncoming;
In++) {
8406assert(In == 0 &&
"Both null and non-null edge masks found");
8408"Distinct incoming values with one having a full mask");
8429if (
ID && (
ID == Intrinsic::assume ||
ID == Intrinsic::lifetime_end ||
8430ID == Intrinsic::lifetime_start ||
ID == Intrinsic::sideeffect ||
8431ID == Intrinsic::pseudoprobe ||
8432ID == Intrinsic::experimental_noalias_scope_decl))
8437// Is it beneficial to perform intrinsic call compared to lib call? 8438bool ShouldUseVectorIntrinsic =
8445if (ShouldUseVectorIntrinsic)
8450 std::optional<unsigned> MaskPos;
// Is it better to call a vectorized version of the function than to
// scalarize the call?

// The following case may be scalarized depending on the VF.
// The flag shows whether we can use a usual Call for the vectorized
// version of the instruction.

// If we've found a variant at a previous VF, then stop looking. A
// vectorized variant of a function expects input in a certain shape
// -- basically the number of input registers, the number of lanes
// per register, and whether there's a mask required.
// We store a pointer to the variant in the VPWidenCallRecipe, so
// once we have an appropriate variant it's only valid for that VF.
// This will force a different vplan to be generated for each VF that
// finds a valid variant.
8473 MaskPos = Decision.MaskPos;
8480if (ShouldUseVectorCall) {
8481if (MaskPos.has_value()) {
8482// We have 2 cases that would require a mask: 8483// 1) The block needs to be predicated, either due to a conditional 8484// in the scalar loop or use of an active lane mask with 8485// tail-folding, and we use the appropriate mask for the block. 8486// 2) No mask is required for the block, but the only available 8487// vector variant at this VF requires a mask, so we synthesize an 8496 Ops.insert(Ops.
begin() + *MaskPos, Mask);
8507assert(!isa<BranchInst>(
I) && !isa<PHINode>(
I) && !isa<LoadInst>(
I) &&
8508 !isa<StoreInst>(
I) &&
"Instruction should have been handled earlier");
8509// Instruction should be widened, unless it is scalar after vectorization, 8510// scalarization is profitable or it is predicated. 8523switch (
I->getOpcode()) {
8526case Instruction::SDiv:
8527case Instruction::UDiv:
8528case Instruction::SRem:
8529case Instruction::URem: {
8530// If not provably safe, use a select to form a safe divisor before widening the 8531// div/rem operation itself. Otherwise fall through to general handling below. 8537auto *SafeRHS = Builder.
createSelect(Mask, Ops[1], One,
I->getDebugLoc());
8543case Instruction::Add:
8544case Instruction::And:
8545case Instruction::AShr:
8546case Instruction::FAdd:
8547case Instruction::FCmp:
8548case Instruction::FDiv:
8549case Instruction::FMul:
8550case Instruction::FNeg:
8551case Instruction::FRem:
8552case Instruction::FSub:
8553case Instruction::ICmp:
8554case Instruction::LShr:
8555case Instruction::Mul:
8556case Instruction::Or:
8557case Instruction::Select:
8558case Instruction::Shl:
8559case Instruction::Sub:
8560case Instruction::Xor:
8561case Instruction::Freeze:
8564// The legacy cost model uses SCEV to check if some of the operands are 8565// constants. To match the legacy cost model's behavior, use SCEV to try 8566// to replace operands with constants. 8568auto GetConstantViaSCEV = [
this, &SE](
VPValue *
Op) {
8570if (isa<Constant>(V) || !SE.
isSCEVable(
V->getType()))
8572auto *
C = dyn_cast<SCEVConstant>(SE.
getSCEV(V));
8577// For Mul, the legacy cost model checks both operands. 8578if (
I->getOpcode() == Instruction::Mul)
8579 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
8580// For other binops, the legacy cost model only checks the second operand. 8581 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
8590// FIXME: Support other operations. 8591unsigned Opcode =
HI->Update->getOpcode();
8592assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
8593"Histogram update operation must be an Add or Sub");
8601// In case of predicated execution (due to tail-folding, or conditional 8602// execution, or both), pass the relevant mask. 8608HI->Store->getDebugLoc());
8614auto *PN = cast<PHINode>(R->getUnderlyingValue());
8616getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8629// Even if the instruction is not marked as uniform, there are certain 8630// intrinsic calls that can be effectively treated as such, so we check for 8631// them here. Conservatively, we only do this for scalable vectors, since 8632// for fixed-width VFs we can always fall back on full scalarization. 8633if (!IsUniform &&
Range.Start.isScalable() && isa<IntrinsicInst>(
I)) {
8635case Intrinsic::assume:
8636case Intrinsic::lifetime_start:
8637case Intrinsic::lifetime_end:
// For scalable vectors if one of the operands is variant then we still
// want to mark as uniform, which will generate one instruction for just
// the first lane of the vector. We can't scalarize the call in the same
// way as for fixed-width vectors because we don't know how many lanes
// there are.
//
// The reasons for doing it this way for scalable vectors are:
//   1. For the assume intrinsic generating the instruction for the first
//      lane is still better than not generating any at all. For
//      example, the input may be a splat across all lanes.
//   2. For the lifetime start/end intrinsics the pointer operand only
//      does anything useful when the input comes from a stack object,
//      which suggests it should always be uniform. For non-stack objects
//      the effect is to poison the object, which still allows us to

// Finalize the recipe for Instr, first if it is not predicated.

// Instructions marked for predication are replicated and a mask operand is
// added initially. Masked replicate recipes will later be placed under an
// if-then construct to prevent side-effects. Generate recipes to compute
// the block mask for this region.

// Note that there is some custom logic to mark some intrinsics as uniform
// manually above for scalable vectors, which this assert needs to account
// for.
assert((
Range.Start.isScalar() || !IsUniform || !IsPredicated ||
8676 (
Range.Start.isScalable() && isa<IntrinsicInst>(
I))) &&
8677"Should not predicate a uniform recipe");
8679 IsUniform, BlockInMask);
8683/// Find all possible partial reductions in the loop and track all of those that 8684/// are valid so recipes can be formed later. 8686// Find all possible partial reductions. 8688 PartialReductionChains;
8690 getScaledReductions(Phi, RdxDesc.getLoopExitInstr(),
Range,
8691 PartialReductionChains);
8694// A partial reduction is invalid if any of its extends are used by 8695// something that isn't another partial reduction. This is because the 8696// extends are intended to be lowered along with the reduction itself. 8698// Build up a set of partial reduction bin ops for efficient use checking. 8700for (
const auto &[PartialRdx, _] : PartialReductionChains)
  PartialReductionBinOps.insert(PartialRdx.BinOp);

auto ExtendIsOnlyUsedByPartialReductions =
    return all_of(Extend->users(), [&](const User *U) {
      return PartialReductionBinOps.contains(U);

// Check if each use of a chain's two extends is a partial reduction
// and only add those that don't have non-partial-reduction users.
for (auto Pair : PartialReductionChains) {
  if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
      ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB))
    ScaledReductionMap.insert(std::make_pair(Chain.Reduction, Pair.second));
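// Illustrative contrast for the extend-use rule above (plain C++, not from
// the source). In the first loop both extends feed only the multiply of the
// reduction, so the chain can become a partial reduction; in the second the
// extend of A[I] escapes to another user, so lowering the extend away with
// the reduction would be incorrect.
#include <cstddef>
#include <cstdint>

int32_t valid_partial_rdx(const int8_t *A, const int8_t *B, std::size_t N) {
  int32_t Sum = 0;
  for (std::size_t I = 0; I < N; ++I)
    Sum += int32_t(A[I]) * int32_t(B[I]); // extends only used by the mul
  return Sum;
}

int32_t invalid_partial_rdx(const int8_t *A, const int8_t *B, int32_t *Copy,
                            std::size_t N) {
  int32_t Sum = 0;
  for (std::size_t I = 0; I < N; ++I) {
    int32_t WideA = int32_t(A[I]); // extend also stored below -> extra user
    Copy[I] = WideA;
    Sum += WideA * int32_t(B[I]);
  }
  return Sum;
}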
8720bool VPRecipeBuilder::getScaledReductions(
8727// TODO: Allow scaling reductions when predicating. The select at 8728// the end of the loop chooses between the phi value and most recent 8729// reduction result, both of which have different VFs to the active lane 8730// mask when scaling. 8734auto *Update = dyn_cast<BinaryOperator>(RdxExitInstr);
8738Value *
Op = Update->getOperand(0);
8739Value *PhiOp = Update->getOperand(1);
8743// Try and get a scaled reduction from the first non-phi operand. 8744// If one is found, we use the discovered reduction instruction in 8745// place of the accumulator for costing. 8746if (
auto *OpInst = dyn_cast<Instruction>(
Op)) {
8747if (getScaledReductions(
PHI, OpInst,
Range, Chains)) {
8748PHI = Chains.rbegin()->first.Reduction;
8750Op = Update->getOperand(0);
8751 PhiOp = Update->getOperand(1);
8759auto *BinOp = dyn_cast<BinaryOperator>(
Op);
8760if (!BinOp || !BinOp->hasOneUse())
8769Instruction *ExtA = cast<Instruction>(BinOp->getOperand(0));
8770Instruction *ExtB = cast<Instruction>(BinOp->getOperand(1));
unsigned TargetScaleFactor =
    PHI->getType()->getPrimitiveSizeInBits().getKnownScalarFactor(
        A->getType()->getPrimitiveSizeInBits());
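// Worked example of the scale factor computed above (illustrative): with an
// i32 accumulator phi and i8 inputs, the accumulator is 32 / 8 = 4 times as
// wide as one input lane, so the partial reduction is scaled by a factor of 4
// (e.g. a <16 x i8> input chain accumulating into <4 x i32>).
static constexpr unsigned scaleFactor(unsigned PhiBits, unsigned InputBits) {
  return PhiBits / InputBits; // 32 / 8 == 4
}
static_assert(scaleFactor(32, 8) == 4, "i8 -> i32 accumulates 4:1");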
8786 Update->getOpcode(),
A->getType(),
B->getType(),
PHI->getType(),
8787 VF, OpAExtend, OpBExtend,
8788 std::make_optional(BinOp->getOpcode()));
8792 Chains.push_back(std::make_pair(Chain, TargetScaleFactor));
8803// First, check for specific widening recipes that deal with inductions, Phi 8804// nodes, calls and memory operations. 8806if (
auto *Phi = dyn_cast<PHINode>(Instr)) {
8807if (Phi->getParent() != OrigLoop->
getHeader())
8810if ((Recipe = tryToOptimizeInductionPHI(Phi,
Operands,
Range)))
8816"can only widen reductions and fixed-order recurrences here");
8824// If the PHI is used by a partial reduction, set the scale factor. 8825unsigned ScaleFactor =
8831// TODO: Currently fixed-order recurrences are modeled as chains of 8832// first-order recurrences. If there are no users of the intermediate 8833// recurrences in the chain, the fixed order recurrence should be modeled 8834// directly, enabling more efficient codegen. 8838 PhisToFix.push_back(PhiRecipe);
8842if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
8846// All widen recipes below deal only with VF > 1. 8851if (
auto *CI = dyn_cast<CallInst>(Instr))
8854if (
StoreInst *SI = dyn_cast<StoreInst>(Instr))
8856return tryToWidenHistogram(*HistInfo,
Operands);
8858if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8864if (!shouldWiden(Instr,
Range))
8867if (
auto *
GEP = dyn_cast<GetElementPtrInst>(Instr))
8871if (
auto *SI = dyn_cast<SelectInst>(Instr)) {
8876if (
auto *CI = dyn_cast<CastInst>(Instr)) {
8881return tryToWiden(Instr,
Operands, VPBB);
8888"Unexpected number of operands for partial reduction");
8893if (isa<VPReductionPHIRecipe>(BinOpRecipe) ||
8894 isa<VPPartialReductionRecipe>(BinOpRecipe))
8901void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
ElementCount MinVF,
8905auto MaxVFTimes2 = MaxVF * 2;
8907VFRange SubRange = {VF, MaxVFTimes2};
8908if (
auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8909// Now optimize the initial VPlan. 8914// TODO: try to put it close to addActiveLaneMask(). 8915// Discard the plan if it is not EVL-compatible 8920 VPlans.push_back(std::move(Plan));
8926// Add the necessary canonical IV and branch recipes required to control the 8930Value *StartIdx = ConstantInt::get(IdxTy, 0);
8933// Add a VPCanonicalIVPHIRecipe starting at 0 to the header. 8937 Header->insert(CanonicalIVPHI, Header->begin());
8940// Add a VPInstruction to increment the scalar canonical IV by VF * UF. 8942 Instruction::Add, {CanonicalIVPHI, &Plan.
getVFxUF()}, {HasNUW,
false},
DL,
8944 CanonicalIVPHI->
addOperand(CanonicalIVIncrement);
8946// Add the BranchOnCount VPInstruction to the latch. 8951/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the 8952/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute 8953/// the end value of the induction. 8957auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
8958// Truncated wide inductions resume from the last lane of their vector value 8959// in the last vector iteration which is handled elsewhere. 8960if (WideIntOrFp && WideIntOrFp->getTruncInst())
8967if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
8969ID.getKind(), dyn_cast_or_null<FPMathOperator>(
ID.getInductionBinOp()),
8970 Start, VectorTC, Step);
8973// EndValue is derived from the vector trip count (which has the same type as 8974// the widest induction) and thus may be wider than the induction here. 8982auto *ResumePhiRecipe =
8985return ResumePhiRecipe;
8988/// Create resume phis in the scalar preheader for first-order recurrences, 8989/// reductions and inductions, and update the VPIRInstructions wrapping the 8990/// original phis in the scalar header. End values for inductions are added to 8996auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getSinglePredecessor());
9000VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9005auto *ScalarPhiIRI = cast<VPIRInstruction>(&ScalarPhiR);
9006auto *ScalarPhiI = dyn_cast<PHINode>(&ScalarPhiIRI->getInstruction());
9010// TODO: Extract final value from induction recipe initially, optimize to 9011// pre-computed end value together in optimizeInductionExitUsers. 9012auto *VectorPhiR = cast<VPHeaderPHIRecipe>(Builder.
getRecipe(ScalarPhiI));
9013if (
auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
9015 WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
9018"Expected a ResumePhi");
9019 IVEndValues[WideIVR] = ResumePhi->getOperand(0);
9020 ScalarPhiIRI->addOperand(ResumePhi);
9023// TODO: Also handle truncated inductions here. Computing end-values 9024// separately should be done as VPlan-to-VPlan optimization, after 9025// legalizing all resume values to use the last lane from the loop. 9026assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
9027"should only skip truncated wide inductions");
9031// The backedge value provides the value to resume coming out of a loop, 9032// which for FORs is a vector whose last element needs to be extracted. The 9033// start value provides the value if the loop is bypassed. 9034bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
9035auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
9037"Cannot handle loops with uncountable early exits");
9041"vector.recur.extract");
9045 {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, {},
Name);
9050// Collect VPIRInstructions for phis in the exit blocks that are modeled 9051// in VPlan and add the exiting VPValue as operand. 9058auto *ExitIRI = dyn_cast<VPIRInstruction>(&R);
9061auto *ExitPhi = dyn_cast<PHINode>(&ExitIRI->getInstruction());
9065assert(ExitIRI->getNumOperands() ==
9066 ExitVPBB->getPredecessors().size() &&
9067"early-exit must update exit values on construction");
9071Value *IncomingValue = ExitPhi->getIncomingValueForBlock(ExitingBB);
9073 ExitIRI->addOperand(V);
9076assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
9077"Only recipes defined inside a region should need fixing.");
9078 ExitUsersToFix.
insert(ExitIRI);
9081return ExitUsersToFix;
9084// Add exit values to \p Plan. Extracts are added for each entry in \p 9085// ExitUsersToFix if needed and their operands are updated. 9089if (ExitUsersToFix.
empty())
9093VPBuilderB(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9095// Introduce extract for exiting values and update the VPIRInstructions 9096// modeling the corresponding LCSSA phis. 9098assert(ExitIRI->getNumOperands() == 1 &&
9099 ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
9100"exit values from early exits must be fixed when branch to " 9101"early-exit is added");
9102 ExitIRI->extractLastLaneOfOperand(
B);
9106/// Handle users in the exit block for first order reductions in the original 9107/// exit block. The penultimate value of recurrences is fed to their LCSSA phi 9108/// users in the original exit block using the VPIRInstruction wrapping to the 9116VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
9121auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
9126"Cannot handle loops with uncountable early exits");
9128// This is the second phase of vectorizing first-order recurrences, creating 9129// extract for users outside the loop. An overview of the transformation is 9130// described below. Suppose we have the following loop with some use after 9131// the loop of the last a[i-1], 9133// for (int i = 0; i < n; ++i) { 9139// There is a first-order recurrence on "a". For this loop, the shorthand 9140// scalar IR looks like: 9147// i = phi [0, scalar.ph], [i+1, scalar.body] 9148// s1 = phi [s.init, scalar.ph], [s2, scalar.body] 9151// br cond, scalar.body, exit.block 9154// use = lcssa.phi [s1, scalar.body] 9156// In this example, s1 is a recurrence because it's value depends on the 9157// previous iteration. In the first phase of vectorization, we created a 9158// VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts 9159// for users in the scalar preheader and exit block. 9162// v_init = vector(..., ..., ..., a[-1]) 9166// i = phi [0, vector.ph], [i+4, vector.body] 9167// v1 = phi [v_init, vector.ph], [v2, vector.body] 9168// v2 = a[i, i+1, i+2, i+3] 9170// // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2)) 9171// b[i, i+1, i+2, i+3] = v2 - v1 9172// br cond, vector.body, middle.block 9175// vector.recur.extract.for.phi = v2(2) 9176// vector.recur.extract = v2(3) 9177// br cond, scalar.ph, exit.block 9180// scalar.recur.init = phi [vector.recur.extract, middle.block], 9181// [s.init, otherwise] 9185// i = phi [0, scalar.ph], [i+1, scalar.body] 9186// s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body] 9189// br cond, scalar.body, exit.block 9192// lo = lcssa.phi [s1, scalar.body], 9193// [vector.recur.extract.for.phi, middle.block] 9195// Now update VPIRInstructions modeling LCSSA phis in the exit block. 9196// Extract the penultimate value of the recurrence and use it as operand for 9197// the VPIRInstruction modeling the phi. 9199if (ExitIRI->getOperand(0) != FOR)
9203"vector.recur.extract.for.phi");
9205 ExitUsersToFix.remove(ExitIRI);
9211LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VFRange &
Range) {
9215// --------------------------------------------------------------------------- 9216// Build initial VPlan: Scan the body of the loop in a topological order to 9217// visit each basic block after having visited its predecessor basic blocks. 9218// --------------------------------------------------------------------------- 9220// Create initial VPlan skeleton, having a basic block for the pre-header 9221// which contains SCEV expansions that need to happen before the CFG is 9222// modified; a basic block for the vector pre-header, followed by a region for 9223// the vector loop, followed by the middle basic block. The skeleton vector 9224// loop region contains a header and latch basic blocks. 9226bool RequiresScalarEpilogueCheck =
9233 PSE, RequiresScalarEpilogueCheck,
9236// Don't use getDecisionAndClampRange here, because we don't know the UF 9237// so this function is better to be conservative, rather than to split 9238// it up into different VPlans. 9239// TODO: Consider using getDecisionAndClampRange here to split up VPlans. 9240bool IVUpdateMayOverflow =
false;
9246// Use NUW for the induction increment if we proved that it won't overflow in 9247// the vector loop or when not folding the tail. In the later case, we know 9248// that the canonical induction increment will not overflow as the vector trip 9249// count is >= increment and a multiple of the increment. 9256// --------------------------------------------------------------------------- 9257// Pre-construction: record ingredients whose recipes we'll need to further 9258// process after constructing the initial VPlan. 9259// --------------------------------------------------------------------------- 9261// For each interleave group which is relevant for this (possibly trimmed) 9262// Range, add it to the set of groups to be later applied to the VPlan and add 9263// placeholders for its members' Recipes which we'll be replacing with a 9264// single VPInterleaveRecipe. 9270// For scalable vectors, the only interleave factor currently supported 9271// is 2 since we require the (de)interleave2 intrinsics instead of 9274"Unsupported interleave factor for scalable vectors");
9279 InterleaveGroups.
insert(IG);
9282// --------------------------------------------------------------------------- 9283// Construct recipes for the instructions in the loop 9284// --------------------------------------------------------------------------- 9286// Scan the body of the loop in a topological order to visit each basic block 9287// after having visited its predecessor basic blocks. 9297 bool NeedsBlends = BB != HeaderBB && !BB->phis().empty();
9298 return Legal->blockNeedsPredication(BB) || NeedsBlends;
9301 RecipeBuilder.collectScaledReductions(
Range);
9303auto *MiddleVPBB = Plan->getMiddleBlock();
9306// Relevant instructions from basic block BB will be grouped into VPRecipe 9307// ingredients and fill a new VPBasicBlock. 9308if (VPBB != HeaderVPBB)
9312if (VPBB == HeaderVPBB)
9313 RecipeBuilder.createHeaderMask();
9315 RecipeBuilder.createBlockInMask(BB);
9317// Introduce each ingredient into VPlan. 9318// TODO: Model and preserve debug intrinsics in VPlan. 9322auto *
Phi = dyn_cast<PHINode>(Instr);
9323if (Phi &&
Phi->getParent() == HeaderBB) {
9324Operands.push_back(Plan->getOrAddLiveIn(
9327auto OpRange = RecipeBuilder.mapToVPValues(
Instr->operands());
9328Operands = {OpRange.begin(), OpRange.end()};
9331// The stores with invariant address inside the loop will be deleted, and 9332// in the exit block, a uniform store recipe will be created for the final 9333// invariant store of the reduction. 9335if ((SI = dyn_cast<StoreInst>(&
I)) &&
9337// Only create recipe for the final invariant store of the reduction. 9341 SI, RecipeBuilder.mapToVPValues(
Instr->operands()),
9342true/* IsUniform */);
9343 Recipe->insertBefore(*MiddleVPBB, MBIP);
9348 RecipeBuilder.tryToCreateWidenRecipe(Instr,
Operands,
Range, VPBB);
9350 Recipe = RecipeBuilder.handleReplication(Instr,
Range);
9352 RecipeBuilder.setRecipe(Instr, Recipe);
9353if (isa<VPHeaderPHIRecipe>(Recipe)) {
9354// VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. In 9355// the following cases, VPHeaderPHIRecipes may be created after non-phi 9356// recipes and need to be moved to the phi section of HeaderVPBB: 9357// * tail-folding (non-phi recipes computing the header mask are 9358// introduced earlier than regular header phi recipes, and should appear 9360// * Optimizing truncates to VPWidenIntOrFpInductionRecipe. 9364"unexpected recipe needs moving");
9374// After here, VPBB should not be used. 9377assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
9378 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
9379"entry block must be set to a VPRegionBlock having a non-empty entry " 9381 RecipeBuilder.fixHeaderPhis();
9383// Update wide induction increments to use the same step as the corresponding 9384// wide induction. This enables detecting induction increments directly in 9385// VPlan and removes redundant splats. 9387auto *IVInc = cast<Instruction>(
9392 cast<VPWidenInductionRecipe>(RecipeBuilder.getRecipe(Phi));
9397if (
auto *UncountableExitingBlock =
9400 *Plan, *PSE.
getSE(), OrigLoop, UncountableExitingBlock,
9403"Some exit values in loop with uncountable exit not supported yet",
9404"UncountableEarlyExitLoopsUnsupportedExitValue", ORE, OrigLoop);
9415// --------------------------------------------------------------------------- 9416// Transform initial VPlan: Apply previously taken decisions, in order, to 9417// bring the VPlan to its final state. 9418// --------------------------------------------------------------------------- 9420// Adjust the recipes for any inloop reductions. 9421 adjustRecipesForReductions(Plan, RecipeBuilder,
Range.Start);
9423// Interleave memory: for each Interleave Group we marked earlier as relevant 9424// for this VPlan, replace the Recipes widening its memory instructions with a 9425// single VPInterleaveRecipe at its insertion point. 9431 Plan->setName(
"Initial VPlan");
9433// Replace VPValues for known constant strides guaranteed by predicate scalar 9436auto *
R = cast<VPRecipeBase>(&U);
9437returnR->getParent()->getParent() ||
9439 Plan->getVectorLoopRegion()->getSinglePredecessor();
9442auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
9443auto *ScevStride = dyn_cast<SCEVConstant>(PSE.
getSCEV(StrideV));
9444// Only handle constant strides for now. 9448auto *CI = Plan->getOrAddLiveIn(
9449 ConstantInt::get(Stride->getType(), ScevStride->getAPInt()));
9450if (
VPValue *StrideVPV = Plan->getLiveIn(StrideV))
9453// The versioned value may not be used in the loop directly but through a 9454// sext/zext. Add new live-ins in those cases. 9456if (!isa<SExtInst, ZExtInst>(U))
9458VPValue *StrideVPV = Plan->getLiveIn(U);
9461unsigned BW =
U->getType()->getScalarSizeInBits();
9462APIntC = isa<SExtInst>(U) ? ScevStride->getAPInt().sext(BW)
9463 : ScevStride->getAPInt().zext(BW);
9464VPValue *CI = Plan->getOrAddLiveIn(ConstantInt::get(
U->getType(),
C));
9473// Sink users of fixed-order recurrence past the recipe defining the previous 9474// value and introduce FirstOrderRecurrenceSplice VPInstructions. 9479// TODO: Move checks to VPlanTransforms::addActiveLaneMask once 9480// TailFoldingStyle is visible there. 9482bool WithoutRuntimeCheck =
9485 WithoutRuntimeCheck);
9494// Outer loop handling: They may require CFG and instruction level 9495// transformations before even evaluating whether vectorization is profitable. 9496// Since we cannot modify the incoming IR, we need to build VPlan upfront in 9497// the vectorization pipeline. 9501// Create new empty VPlan 9503true,
false, OrigLoop);
9505// Build hierarchical CFG 9507 HCFGBuilder.buildHierarchicalCFG();
9515 *PSE.
getSE(), *TLI);
9517// Tail folding is not supported for outer loops, so the induction increment 9518// is guaranteed to not wrap. 9523// Collect mapping of IR header phis to header phi recipes, to be used in 9524// addScalarResumePhis. 9527for (
auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9528if (isa<VPCanonicalIVPHIRecipe>(&R))
9530auto *HeaderR = cast<VPHeaderPHIRecipe>(&R);
9531 RecipeBuilder.setRecipe(HeaderR->getUnderlyingInstr(), HeaderR);
9534// TODO: IVEndValues are not used yet in the native path, to optimize exit 9542// Adjust the recipes for reductions. For in-loop reductions the chain of 9543// instructions leading from the loop exit instr to the phi need to be converted 9544// to reductions, with one operand being vector and the other being the scalar 9545// reduction chain. For other reductions, a select is introduced between the phi 9546// and users outside the vector region when folding the tail. 9548// A ComputeReductionResult recipe is added to the middle block, also for 9549// in-loop reductions which compute their result in-loop, because generating 9550// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes. 9552// Adjust AnyOf reductions; replace the reduction phi for the selected value 9553// with a boolean reduction phi node to check if the condition is true in any 9554// iteration. The final value is selected by the final ComputeReductionResult. 9555void LoopVectorizationPlanner::adjustRecipesForReductions(
9557using namespaceVPlanPatternMatch;
9558VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
9564auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9565if (!PhiR || !PhiR->isInLoop() || (MinVF.
isScalar() && !PhiR->isOrdered()))
9573"AnyOf and FindLast reductions are not allowed for in-loop reductions");
9575// Collect the chain of "link" recipes for the reduction starting at PhiR. 9578for (
unsignedI = 0;
I != Worklist.
size(); ++
I) {
9581auto *UserRecipe = cast<VPSingleDefRecipe>(U);
9582if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
9583assert((UserRecipe->getParent() == MiddleVPBB ||
9584 UserRecipe->getParent() == Plan->getScalarPreheader()) &&
9585"U must be either in the loop region, the middle block or the " 9586"scalar preheader.");
9589 Worklist.
insert(UserRecipe);
9593// Visit operation "Links" along the reduction chain top-down starting from 9594// the phi until LoopExitValue. We keep track of the previous item 9595// (PreviousLink) to tell which of the two operands of a Link will remain 9596// scalar and which will be reduced. For minmax by select(cmp), Link will be 9597// the select instructions. Blend recipes of in-loop reduction phi's will 9598// get folded to their non-phi operand, as the reduction recipe handles the 9599// condition directly. 9602Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();
9604// Index of the first operand which holds a non-mask vector operand. 9605unsigned IndexOfFirstOperand;
9606// Recognize a call to the llvm.fmuladd intrinsic. 9613"Expected instruction to be a call to the llvm.fmuladd intrinsic");
9614assert(((MinVF.
isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
9615 isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
9616 CurrentLink->getOperand(2) == PreviousLink &&
9617"expected a call where the previous link is the added operand");
9619// If the instruction is a call to the llvm.fmuladd intrinsic then we 9620// need to create an fmul recipe (multiplying the first two operands of 9621// the fmuladd together) to use as the vector operand for the fadd 9625 {CurrentLink->getOperand(0), CurrentLink->getOperand(1)},
9627 LinkVPBB->
insert(FMulRecipe, CurrentLink->getIterator());
9630auto *Blend = dyn_cast<VPBlendRecipe>(CurrentLink);
9631if (PhiR->isInLoop() && Blend) {
9632assert(Blend->getNumIncomingValues() == 2 &&
9633"Blend must have 2 incoming values");
9634if (Blend->getIncomingValue(0) == PhiR)
9635 Blend->replaceAllUsesWith(Blend->getIncomingValue(1));
9637assert(Blend->getIncomingValue(1) == PhiR &&
9638"PhiR must be an operand of the blend");
9639 Blend->replaceAllUsesWith(Blend->getIncomingValue(0));
9645if (isa<VPWidenRecipe>(CurrentLink)) {
9646assert(isa<CmpInst>(CurrentLinkI) &&
9647"need to have the compare of the select");
9650assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
9651"must be a select recipe");
9652 IndexOfFirstOperand = 1;
9655"Expected to replace a VPWidenSC");
9656 IndexOfFirstOperand = 0;
9658// Note that for non-commutable operands (cmp-selects), the semantics of 9659// the cmp-select are captured in the recurrence kind. 9661 CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink
9662 ? IndexOfFirstOperand + 1
9663 : IndexOfFirstOperand;
9664 VecOp = CurrentLink->getOperand(VecOpId);
9665assert(VecOp != PreviousLink &&
9666 CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
9667 (VecOpId - IndexOfFirstOperand)) ==
9669"PreviousLink must be the operand other than VecOp");
9678 RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp,
9680// Append the recipe to the end of the VPBasicBlock because we need to 9681// ensure that it comes after all of it's inputs, including CondOp. 9682// Delete CurrentLink as it will be invalid if its operand is replaced 9683// with a reduction defined at the bottom of the block in the next link. 9685 CurrentLink->replaceAllUsesWith(RedRecipe);
9687 PreviousLink = RedRecipe;
9694 Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9700// If tail is folded by masking, introduce selects between the phi 9701// and the users outside the vector region of each reduction, at the 9702// beginning of the dedicated latch block. 9707assert(OrigExitingVPV->getDefiningRecipe()->getParent() != LatchVPBB &&
9708"reduction recipe must be defined before latch");
9710 std::optional<FastMathFlags> FMFs =
9717return isa<VPInstruction>(&U) &&
9718 cast<VPInstruction>(&U)->getOpcode() ==
9726// If the vector reduction can be performed in a smaller type, we truncate 9727// then extend the loop exit value to enable InstCombine to evaluate the 9728// entire expression in the smaller type. 9733assert(!PhiR->
isInLoop() &&
"Unexpected truncated inloop reduction!");
9742 Trunc->
insertAfter(NewExitingVPV->getDefiningRecipe());
9743 Extnd->insertAfter(Trunc);
9745 PhiR->
setOperand(1, Extnd->getVPSingleValue());
9746 NewExitingVPV = Extnd;
9749// We want code in the middle block to appear to execute on the location of 9750// the scalar loop's latch terminator because: (a) it is all compiler 9751// generated, (b) these instructions are always executed after evaluating 9752// the latch conditional branch, and (c) other passes may add new 9753// predecessors which terminate on this line. This is the easiest way to 9754// ensure we don't accidentally cause an extra step back into the loop while 9758// TODO: At the moment ComputeReductionResult also drives creation of the 9759// bc.merge.rdx phi nodes, hence it needs to be created unconditionally here 9760// even for in-loop reductions, until the reduction resume value handling is 9761// also modeled in VPlan. 9764// Update all users outside the vector region. 9766 FinalReductionResult, [](
VPUser &
User,
unsigned) {
9767auto *Parent = cast<VPRecipeBase>(&
User)->getParent();
9768return Parent && !Parent->getParent();
9770 FinalReductionResult->insertBefore(*MiddleVPBB, IP);
9772// Adjust AnyOf reductions; replace the reduction phi for the selected value 9773// with a boolean reduction phi node to check if the condition is true in 9774// any iteration. The final value is selected by the final 9775// ComputeReductionResult. 9779 return isa<VPWidenSelectRecipe>(U) ||
9780 (isa<VPReplicateRecipe>(U) &&
9781 cast<VPReplicateRecipe>(U)->getUnderlyingInstr()->getOpcode() ==
9782 Instruction::Select);
9785// If the compare is checking the reduction PHI node, adjust it to check 9788for (
unsignedI = 0;
I != CmpR->getNumOperands(); ++
I)
9789if (CmpR->getOperand(
I) == PhiR)
9795// If the true value of the select is the reduction phi, the new value is 9796// selected if the negated condition is true in any iteration. 9797if (
Select->getOperand(1) == PhiR)
9800Select->getVPSingleValue()->replaceAllUsesWith(
Or);
9801// Delete Select now that it has invalid types. 9804// Convert the reduction phi to operate on bools. 9812// Adjust the start value for FindLastIV recurrences to use the sentinel 9813// value after generating the ResumePhi recipe, which uses the original 9821R->eraseFromParent();
assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
// Fast-math-flags propagate from the original induction instruction.
    cast_if_present<BinaryOperator>(FPBinOp));
// If index is the vector trip count, the concrete value will only be set in
// prepareToExecute, leading to missed simplifications, e.g. if it is 0.
// TODO: Remove the special case for the vector trip count once it is computed
// in VPlan and can be used during VPlan simplification.
assert((DerivedIV != Index ||
       "IV didn't need transforming?");

if (State.Lane) {
  // Generate a single instance.
  "uniform recipe shouldn't be predicated");
  // Insert scalar instance packing it into a vector.
  // If we're constructing lane 0, initialize to start from poison.
  if (State.Lane->isFirstLane()) {

// Uniform within VL means we need to generate lane 0.
// A store of a loop varying value to a uniform address only needs the last
// copy of the store.
if (isa<StoreInst>(UI) &&

// Generate scalar instances for all VF lanes.
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
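// Illustrative sketch (not part of this file): the per-lane scalarization
// loop above, with a callback standing in for cloning the instruction for one
// lane. EndLane is assumed to be at least 1; the helper name is hypothetical.
#include <functional>

static void scalarizeAcrossLanes(unsigned EndLane, bool StoreToUniformAddress,
                                 const std::function<void(unsigned)> &EmitLane) {
  if (StoreToUniformAddress) {
    // A store of a loop-varying value to a uniform address only needs the
    // last copy; earlier copies would be overwritten anyway.
    EmitLane(EndLane - 1);
    return;
  }
  for (unsigned Lane = 0; Lane < EndLane; ++Lane)
    EmitLane(Lane);   // one scalar clone per lane
}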
// Determine how to lower the scalar epilogue, which depends on 1) optimising
// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
// predication, and 4) a TTI hook that analyses whether the loop is suitable
// for predication.

// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
// LoopAccessInfo (due to code dependency and not being able to reliably get
// PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
// of strides in LoopAccessInfo::analyzeLoop() and vectorize without
// versioning when the vectorization is forced, unlike hasOptSize. So revert
// back to the old way and vectorize with versioning when forced. See D81345.)

// 2) If set, obey the directives.

// 3) If set, obey the hints.

// 4) If the TTI hook indicates this is profitable, request predication.

// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows to apply
// VPlan-to-VPlan transformations from the very beginning without modifying
// the input IR.
LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");

Function *F = L->getHeader()->getParent();
LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,

// Get user vectorization factor.
// Plan how to best vectorize, return the best VF and its cost.

// If we are stress testing VPlan builds, do not attempt to generate vector
// code. Masked vector code generation support will follow soon.
// Also, do not attempt to vectorize if no vector code will be produced.
bool AddBranchWeights =
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
    VF.Width, 1, LVL, &CM, BFI, PSI, Checks, BestPlan);
    << L->getHeader()->getParent()->getName() << "\"\n");
// Mark the loop as already vectorized to avoid vectorizing again.

// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with floating point there
// will be a performance penalty from the conversion overhead and the change in
// the vector width.
if (auto *S = dyn_cast<StoreInst>(&Inst)) {
  if (S->getValueOperand()->getType()->isFloatTy())

// Traverse the floating point stores upwards, searching for floating point
// conversions.
while (!Worklist.empty()) {
  if (!L->contains(I))
  if (!Visited.insert(I).second)

  // Emit a remark if the floating point store required a floating
  // point conversion.
  // TODO: More work could be done to identify the root cause such as a
  // constant or a function return type and point the user to it.
  if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
        I->getDebugLoc(), L->getHeader())
        << "floating point conversion changes vector width. "
        << "Mixed floating point precision requires an up/down "
        << "cast that will negatively impact performance.";

  for (Use &Op : I->operands())
    if (auto *OpI = dyn_cast<Instruction>(Op))
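// Illustrative sketch (not part of this file): the remark logic above is a
// def-use worklist walk that starts at float stores and flags any fpext it
// reaches inside the loop. Standalone version with a hypothetical Node type
// standing in for Instruction:
#include <set>
#include <vector>

struct Node {
  bool IsFPExt = false;              // models isa<FPExtInst>(I)
  bool InLoop = true;                // models L->contains(I)
  std::vector<Node *> Operands;      // models I->operands()
};

static std::vector<Node *>
findConversionsFeedingFloatStores(const std::vector<Node *> &FloatStores) {
  std::vector<Node *> Worklist(FloatStores.begin(), FloatStores.end());
  std::set<const Node *> Visited;
  std::vector<Node *> ToRemark;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (!N->InLoop || !Visited.insert(N).second)
      continue;                      // out of the loop, or already seen
    if (N->IsFPExt)
      ToRemark.push_back(N);         // conversion that changes vector width
    for (Node *Op : N->Operands)     // keep walking up the operands
      Worklist.push_back(Op);
  }
  return ToRemark;
}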
// When interleaving only, scalar and vector cost will be equal, which in turn
// would lead to a divide by 0. Fall back to the hard threshold.
    << "LV: Interleaving only is not profitable due to runtime checks\n");

// The scalar cost should only be 0 when vectorizing with a user-specified
// VF/IC. In those cases, runtime checks should always be generated.

// First, compute the minimum iteration count required so that the vector
// loop outperforms the scalar loop.
// The total cost of the scalar loop is
//   ScalarC * TC
// where
// * TC is the actual trip count of the loop.
// * ScalarC is the cost of a single scalar iteration.
//
// The total cost of the vector loop is
//   RtC + VecC * (TC / VF) + EpiC
// where
// * RtC is the cost of the generated runtime checks
// * VecC is the cost of a single vector iteration.
// * TC is the actual trip count of the loop
// * VF is the vectorization factor
// * EpiC is the cost of the generated epilogue, including the cost
//   of the remaining scalar operations.
//
// Vectorization is profitable once the total vector cost is less than the
// total scalar cost:
//   RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
//
// Now we can compute the minimum required trip count TC as
//   VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
//
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers, and the result
// is rounded up, hence we get an upper estimate of the TC.

// Second, compute a minimum iteration count so that the cost of the
// runtime checks is only a fraction of the total scalar loop cost. This
// adds a loop-dependent bound on the overhead incurred if the runtime
// checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
// * TC. To bound the runtime check to be a fraction 1/X of the scalar
// cost, we need
//   RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC

// Now pick the larger minimum. If it is not a multiple of VF and a scalar
// epilogue is allowed, choose the next closest multiple of VF. This should
// partly compensate for ignoring the epilogue cost.
uint64_t MinTC = std::max(MinTC1, MinTC2);
MinTC = alignTo(MinTC, IntVF);
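// Illustrative sketch (not part of this file): the two minimum-trip-count
// bounds derived in the comments above, computed on doubles and rounded up as
// described. RtC, VecC, ScalarC, VF and X are the symbolic quantities from
// those comments; EpiC is taken as 0, as stated there. It assumes
// ScalarC * VF > VecC, i.e. the vector body is actually cheaper per element.
#include <algorithm>
#include <cmath>
#include <cstdint>

static uint64_t minProfitableTripCount(double RtC, double VecC, double ScalarC,
                                       uint64_t VF, double X) {
  // Bound 1: total vector cost beats total scalar cost,
  //   RtC + VecC * (TC / VF) < ScalarC * TC  ==>
  //   TC > VF * RtC / (ScalarC * VF - VecC).
  uint64_t MinTC1 = static_cast<uint64_t>(
      std::ceil(double(VF) * RtC / (ScalarC * double(VF) - VecC)));
  // Bound 2: a failing runtime check costs at most a 1/X fraction of the
  // scalar loop:  RtC * X / ScalarC < TC.
  uint64_t MinTC2 = static_cast<uint64_t>(std::ceil(RtC * X / ScalarC));
  // Take the larger bound and round it up to a multiple of VF, as alignTo()
  // does above.
  uint64_t MinTC = std::max(MinTC1, MinTC2);
  return (MinTC + VF - 1) / VF * VF;
}
// For example, RtC = 20, VecC = 6, ScalarC = 2, VF = 4 and X = 8 give
// MinTC1 = 40 and MinTC2 = 80, so the trip count must be at least 80 for the
// runtime checks to pay off.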
    dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"

// Skip vectorization if the expected trip count is less than the minimum
// required trip count.
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
                     "trip count < minimum profitable VF ("

    : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
      VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||

/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
/// don't have a corresponding wide induction in \p EpiPlan.
// Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
// will need their resume-values computed in the main vector loop. Others
// can be removed from the main VPlan.
if (isa<VPCanonicalIVPHIRecipe>(&R))
    cast<PHINode>(R.getVPSingleValue()->getUnderlyingValue()));

auto *VPIRInst = cast<VPIRInstruction>(&R);
auto *IRI = dyn_cast<PHINode>(&VPIRInst->getInstruction());
// There is no corresponding wide induction in the epilogue plan that would
// need a resume value. Remove the VPIRInst wrapping the scalar header phi
// together with the corresponding ResumePhi. The resume values for the
// scalar loop will be created during execution of EpiPlan.
using namespace VPlanPatternMatch;
// If there is a suitable resume value for the canonical induction in the
// scalar (which will become vector) epilogue loop we are done. Otherwise
// create it below.
return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
    "vec.epilog.resume.val");
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
                                          const SCEV2ValueTy &ExpandedSCEVs,
  Header->setName("vec.epilog.vector.body");

  // Re-use the trip count and steps expanded for the main loop, as
  // skeleton creation needs it as a value that dominates both the scalar
  // and vector epilogue loops.
  // TODO: This is a workaround needed for epilogue vectorization and it
  // should be removed once induction resume value creation is done
  // directly in VPlan.
  auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
      Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
  ExpandR->eraseFromParent();

  // Ensure that the start values for all header phi recipes are updated before
  // vectorizing the epilogue loop.
  if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(&R)) {
    // When vectorizing the epilogue loop, the canonical induction start
    // value needs to be changed from zero to the value after the main
    // vector loop. Find the resume value created during execution of the main
    // VPlan.
    // FIXME: Improve modeling for canonical IV start values in the epilogue
    // loop.
    BasicBlock *MainMiddle = find_singleton<BasicBlock>(
        if (BB != EPI.MainLoopIterationCountCheck &&
            BB != EPI.EpilogueIterationCountCheck &&
            BB != EPI.SCEVSafetyCheck && BB != EPI.MemSafetyCheck)
    Type *IdxTy = IV->getScalarType();
    PHINode *EPResumeVal = find_singleton<PHINode>(
        L->getLoopPreheader()->phis(),
        if (P.getType() == IdxTy &&
            P.getIncomingValueForBlock(MainMiddle) == EPI.VectorTripCount &&
            P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
    assert(EPResumeVal && "must have a resume value for the canonical IV");
           return isa<VPScalarIVStepsRecipe>(U) ||
                  isa<VPScalarCastRecipe>(U) ||
                  isa<VPDerivedIVRecipe>(U) ||
                  cast<VPInstruction>(U)->getOpcode() ==
           "the canonical IV should only be used by its increment or "
           "ScalarIVSteps when resetting the start value");
    IV->setOperand(0, VPV);

  Value *ResumeV = nullptr;
  // TODO: Move setting of resume values to prepareToExecute.
  if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
    ResumeV = cast<PHINode>(ReductionPhi->getUnderlyingInstr())
                  ->getIncomingValueForBlock(L->getLoopPreheader());
        ReductionPhi->getRecurrenceDescriptor();
    // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
    // start value; compare the final value from the main vector loop
    // to the start value.
    // VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
    // to the resume value. The resume value is adjusted to the sentinel
    // value when the final value from the main vector loop equals the start
    // value. This ensures correctness when the start value might not be
    // less than the minimum value of a monotonically increasing induction
    // variable.
  // Retrieve the induction resume values for wide inductions from
  // their original phi nodes in the scalar loop.
  PHINode *IndPhi = cast<VPWidenInductionRecipe>(&R)->getPHINode();
  // Hook up to the PHINode generated by a ResumePhi recipe of main
  // loop VPlan, which feeds the scalar loop.
  assert(ResumeV && "Must have a resume value");
  cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
10327"VPlan-native path is not enabled. Only process inner loops.");
10330 << L->getHeader()->getParent()->getName() <<
"' from " 10331 << L->getLocStr() <<
"\n");
10336dbgs() <<
"LV: Loop hints:" 10346// Function containing loop 10347Function *
F = L->getHeader()->getParent();
10349// Looking at the diagnostic output is the only way to determine if a loop 10350// was vectorized (other than looking at the IR or machine code), so it 10351// is important to generate an optimization remark for each loop. Most of 10352// these messages are generated as OptimizationRemarkAnalysis. Remarks 10353// generated as OptimizationRemark and OptimizationRemarkMissed are 10354// less verbose reporting vectorized loops and unvectorized loops that may 10355// benefit from vectorization, respectively. 10364// Check if it is legal to vectorize the loop. 10369LLVM_DEBUG(
dbgs() <<
"LV: Not vectorizing: Cannot prove legality.\n");
10376"early exit is not enabled",
10377"UncountableEarlyExitLoopsDisabled",
ORE, L);
10383"types is not yet supported",
10384"StructCallVectorizationUnsupported",
ORE, L);
10388// Entrance to the VPlan-native vectorization path. Outer loops are processed 10389// here. They may require CFG and instruction level transformations before 10390// even evaluating whether vectorization is profitable. Since we cannot modify 10391// the incoming IR, we need to build VPlan upfront in the vectorization 10393if (!L->isInnermost())
10397assert(L->isInnermost() &&
"Inner loop expected.");
// If an override option has been passed in for interleaved accesses, use it.

// Analyze interleaved memory accesses.
    [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
    "requiring a scalar epilogue is unsupported",
    "UncountableEarlyExitUnsupported", ORE, L);
// Check the function attributes and profiles to find out if this function
// should be optimized for size.

// Check the loop for a trip count threshold: vectorize loops with a tiny trip
// count by optimizing for size, to minimize overheads.
LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
                  << "This loop is worth vectorizing only if no scalar "
                  << "iteration overheads are incurred.");
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");

// Predicate tail-folded loops are efficient even when the loop
// iteration count is low. However, setting the epilogue policy to
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
// with runtime checks. It's more effective to let
// `areRuntimeChecksProfitable` determine if vectorization is beneficial
// for the loop.
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
                     "small to consider vectorizing.\n");
    "The trip count is below the minimal threshold value.",
    "loop trip count is too low, avoiding vectorization",
    "LowTripCount", ORE, L);

// Check the function attributes to see if implicit floats or vectors are
// used.
if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    "Can't vectorize when the NoImplicitFloat attribute is used",
    "loop not vectorized due to NoImplicitFloat attribute",
    "NoImplicitFloat", ORE, L);

// Check if the target supports potentially unsafe FP vectorization.
// FIXME: Add a check for the type of safety issue (denormal, signaling)
// for the target we're vectorizing for, to make sure none of the
// additional fp-math flags can help.
    "Potentially unsafe FP op prevents vectorization",
    "loop not vectorized due to unsafe FP support.",

bool AllowOrderedReductions;
// If the flag is set, use that instead and override the TTI behaviour.
       ExactFPMathInst->getDebugLoc(),
       ExactFPMathInst->getParent())
    << "loop not vectorized: cannot prove it is safe to reorder "
       "floating-point operations";
LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
                     "reorder floating-point operations\n");
// Use the cost model.
LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,

// Use the planner for vectorization.
LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,

// Get user vectorization factor and interleave count.
// Plan how to best vectorize.
LVP.plan(UserVF, UserIC);

bool AddBranchWeights =
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),

// Select the interleave count.
unsigned SelectedIC = std::max(IC, UserIC);

// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.

// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
if (!ForceVectorization &&
        DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
        << "loop not vectorized: cannot prove it is safe to reorder "
           "memory operations";
// Identify the diagnostic messages that should be produced.
std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
bool VectorizeLoop = true, InterleaveLoop = true;

  LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
  VecDiagMsg = std::make_pair(
      "VectorizationNotBeneficial",
      "the cost-model indicates that vectorization is not beneficial");
  VectorizeLoop = false;

  // Tell the user interleaving was avoided up-front, despite being explicitly
  // requested.
  LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
                       "interleaving should be avoided up front\n");
  IntDiagMsg = std::make_pair(
      "InterleavingAvoided",
      "Ignoring UserIC, because interleaving was avoided up front");
  InterleaveLoop = false;
} else if (IC == 1 && UserIC <= 1) {
  // Tell the user interleaving is not beneficial.
  IntDiagMsg = std::make_pair(
      "InterleavingNotBeneficial",
      "the cost-model indicates that interleaving is not beneficial");
  InterleaveLoop = false;
    IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
    IntDiagMsg.second +=
        " and is explicitly disabled or interleave count is set to 1";
} else if (IC > 1 && UserIC == 1) {
  // Tell the user interleaving is beneficial, but it is explicitly disabled.
  LLVM_DEBUG(
      dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
  IntDiagMsg = std::make_pair(
      "InterleavingBeneficialButDisabled",
      "the cost-model indicates that interleaving is beneficial "
      "but is explicitly disabled or interleave count is set to 1");
  InterleaveLoop = false;

// If there is a histogram in the loop, do not just interleave without
// vectorizing. The order of operations will be incorrect without the
// histogram intrinsics, which are only used for recipes with VF > 1.
if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
  LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
                    << "to histogram operations.\n");
  IntDiagMsg = std::make_pair(
      "HistogramPreventsScalarInterleaving",
      "Unable to interleave without vectorization due to constraints on "
      "the order of histogram operations");
  InterleaveLoop = false;

// Override IC if user provided an interleave count.
IC = UserIC > 0 ? UserIC : IC;
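// Illustrative sketch (not part of this file): the four-way outcome that the
// diagnostics below report, written as a plain dispatch on the two flags.
// Enum and function names are hypothetical.
enum class LVDecision {
  GiveUp,
  InterleaveOnly,
  VectorizeOnly,
  VectorizeAndInterleave
};

static LVDecision pickDecision(bool VectorizeLoop, bool InterleaveLoop) {
  if (!VectorizeLoop && !InterleaveLoop)
    return LVDecision::GiveUp;                 // report both remarks, bail out
  if (!VectorizeLoop)
    return LVDecision::InterleaveOnly;         // interleave (unroll) the scalar loop
  if (!InterleaveLoop)
    return LVDecision::VectorizeOnly;          // widen by VF, no beneficial interleave
  return LVDecision::VectorizeAndInterleave;   // widen by VF and interleave by IC
}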
// Emit diagnostic messages, if any.
if (!VectorizeLoop && !InterleaveLoop) {
  // Do not vectorize or interleave the loop.
           L->getStartLoc(), L->getHeader())
        << VecDiagMsg.second;
           L->getStartLoc(), L->getHeader())
        << IntDiagMsg.second;

if (!VectorizeLoop && InterleaveLoop) {
           L->getStartLoc(), L->getHeader())
        << VecDiagMsg.second;
} else if (VectorizeLoop && !InterleaveLoop) {
        << ") in " << L->getLocStr() << '\n');
           L->getStartLoc(), L->getHeader())
        << IntDiagMsg.second;
} else if (VectorizeLoop && InterleaveLoop) {
        << ") in " << L->getLocStr() << '\n');

bool DisableRuntimeUnroll = false;
MDNode *OrigLoopID = L->getLoopID();

if (!VectorizeLoop) {
  assert(IC > 1 && "interleave count should not be 1 or 0");
  // If we decided that it is not legal to vectorize the loop, then
  // interleave it.
        << "interleaved loop (interleaved count: "
        << NV("InterleaveCount", IC) << ")";

  // If we decided that it is *legal* to vectorize the loop, then do it.

  // Consider vectorizing the epilogue too if it's profitable.
  std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());

  // The first pass vectorizes the main loop and creates a scalar epilogue
  // to be vectorized by executing the plan (potentially with a different
  // factor) again shortly afterwards.
                                       EPI, &LVL, &CM, BFI, PSI, Checks,
      *BestMainPlan, MainILV, DT, false);

  // Second pass vectorizes the epilogue and adjusts the control flow
  // edges from the first pass.
                                           Checks, BestEpiPlan);
      DT, true, &ExpandedSCEVs);
  ++LoopsEpilogueVectorized;

  DisableRuntimeUnroll = true;
      PSI, Checks, BestPlan);

  // Add metadata to disable runtime unrolling a scalar loop when there
  // are no runtime checks about strides and memory. A scalar loop that is
  // rarely used is not worth unrolling.
    DisableRuntimeUnroll = true;

// Report the vectorization decision.
       "DT not preserved correctly");
std::optional<MDNode *> RemainderLoopID =
if (RemainderLoopID) {
  L->setLoopID(*RemainderLoopID);
  if (DisableRuntimeUnroll)

// Mark the loop as already vectorized to avoid vectorizing again.

// 1. the target claims to have no vector registers, and
// 2. interleaving won't help ILP.
//
// The second condition is necessary because, even if the target has no
// vector registers, loop vectorization may still enable scalar
// interleaving.
bool Changed = false, CFGChanged = false;

// The vectorizer requires loops to be in simplified form.
// Since simplification may add new inner loops, it has to run before the
// legality and profitability checks. This means running the loop vectorizer
// will simplify all loops, regardless of whether anything ends up being
// vectorized.
for (const auto &L : *LI)
  Changed |= CFGChanged |=

// Build up a worklist of inner-loops to vectorize. This is necessary as
// the act of vectorizing or partially unrolling a loop creates new loops
// and can invalidate iterators across the loops.
LoopsAnalyzed += Worklist.size();

// Now walk the identified inner loops.
while (!Worklist.empty()) {

// For the inner loops we actually process, form LCSSA to simplify the
// transform.

// Process each loop nest in the function.

// There are no loops in the function. Return before computing other
// expensive analyses.
if (!Result.MadeAnyChange)

if (Result.MadeCFGChange) {
  // Making CFG changes likely means a loop got vectorized. Indicate that
  // extra simplification passes should be run.
  // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
  // be run if runtime checks have been added.
  OS, MapClassName2PassName);
OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
static unsigned getIntrinsicID(const SDNode *N)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefAnalysis InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
This file contains the simple types necessary to represent the attributes associated with functions a...
This is the interface for LLVM's primary stateless and local alias analysis.
static bool IsEmptyBlock(MachineBasicBlock *MBB)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define DEBUG_WITH_TYPE(TYPE,...)
DEBUG_WITH_TYPE macro - This macro should be used by passes to emit debug information.
This file defines DenseMapInfo traits for DenseMap.
This file defines the DenseMap class.
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
This is the interface for a simple mod/ref and alias analysis over globals.
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
Module.h This file contains the declarations for the Module class.
This defines the Use class.
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
Legalize the Machine IR a function s Machine IR
This header provides classes for managing per-loop analyses.
static const char * VerboseDebug
loop Loop Strength Reduction
This file defines the LoopVectorizationLegality class.
This file provides a LoopVectorizationPlanner class.
static void collectSupportedLoops(Loop &L, LoopInfo *LI, OptimizationRemarkEmitter *ORE, SmallVectorImpl< Loop * > &V)
static cl::opt< unsigned > EpilogueVectorizationMinVF("epilogue-vectorization-minimum-VF", cl::Hidden, cl::desc("Only loops with vectorization factor equal to or larger than " "the specified value are considered for epilogue vectorization."))
static cl::opt< unsigned > EpilogueVectorizationForceVF("epilogue-vectorization-force-VF", cl::init(1), cl::Hidden, cl::desc("When epilogue vectorization is enabled, and a value greater than " "1 is specified, forces the given VF for all applicable epilogue " "loops."))
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Create resume phis in the scalar preheader for first-order recurrences, reductions and inductions,...
static void addRuntimeUnrollDisableMetaData(Loop *L)
static ElementCount determineVPlanVF(const TargetTransformInfo &TTI, LoopVectorizationCostModel &CM)
static cl::opt< unsigned > VectorizeMemoryCheckThreshold("vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks"))
static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan)
Prepare MainPlan for vectorizing the main vector loop during epilogue vectorization.
static cl::opt< unsigned > TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), cl::Hidden, cl::desc("Loops with a constant trip count that is smaller than this " "value are vectorized only if no scalar iteration overheads " "are incurred."))
Loops with a known constant trip count below this number are vectorized only if no scalar iteration o...
static void debugVectorizationMessage(const StringRef Prefix, const StringRef DebugMsg, Instruction *I)
Write a DebugMsg about vectorization to the debug output stream.
static cl::opt< bool > EnableCondStoresVectorization("enable-cond-stores-vec", cl::init(true), cl::Hidden, cl::desc("Enable if predication of stores during vectorization."))
static VPInstruction * addResumePhiRecipeForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Create and return a ResumePhi for WideIV, unless it is truncated.
static Value * emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue, Value *Step, InductionDescriptor::InductionKind InductionKind, const BinaryOperator *InductionBinOp)
Compute the transformed value of Index at offset StartValue using step StepValue.
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I)
Look for a meaningful debug location on the instruction or its operands.
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks, VectorizationFactor &VF, Loop *L, const TargetTransformInfo &TTI, PredicatedScalarEvolution &PSE, ScalarEpilogueLowering SEL)
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB)
Replace VPBB with a VPIRBasicBlock wrapping IRBB.
const char LLVMLoopVectorizeFollowupAll[]
static SetVector< VPIRInstruction * > collectUsersInExitBlocks(Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan)
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, SetVector< VPIRInstruction * > &ExitUsersToFix)
Handle users in the exit block for first order reductions in the original exit block.
static cl::opt< bool > ForceTargetSupportsScalableVectors("force-target-supports-scalable-vectors", cl::init(false), cl::Hidden, cl::desc("Pretend that scalable vectors are supported, even if the target does " "not support them. This flag should only be used for testing."))
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW, DebugLoc DL)
static std::optional< unsigned > getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI)
Convenience function that returns the value of vscale_range iff vscale_range.min == vscale_range....
static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style)
static constexpr uint32_t MemCheckBypassWeights[]
cl::opt< unsigned > ForceTargetInstructionCost("force-target-instruction-cost", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's expected cost for " "an instruction to a single constant value. Mostly " "useful for getting consistent testing."))
std::optional< unsigned > getMaxVScale(const Function &F, const TargetTransformInfo &TTI)
static constexpr uint32_t MinItersBypassWeights[]
static cl::opt< unsigned > ForceTargetNumScalarRegs("force-target-num-scalar-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of scalar registers."))
static cl::opt< bool > UseWiderVFIfCallVariantsPresent("vectorizer-maximize-bandwidth-for-vector-calls", cl::init(true), cl::Hidden, cl::desc("Try wider VFs if they enable the use of vector variants"))
static bool planContainsAdditionalSimplifications(VPlan &Plan, VPCostContext &CostCtx, Loop *TheLoop)
Return true if the original loop \ TheLoop contains any instructions that do not have corresponding r...
static cl::opt< unsigned > SmallLoopCost("small-loop-cost", cl::init(20), cl::Hidden, cl::desc("The cost of a loop that is considered 'small' by the interleaver."))
static cl::opt< unsigned > ForceTargetNumVectorRegs("force-target-num-vector-regs", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's number of vector registers."))
static bool isExplicitVecOuterLoop(Loop *OuterLp, OptimizationRemarkEmitter *ORE)
static cl::opt< bool > EnableIndVarRegisterHeur("enable-ind-var-reg-heur", cl::init(true), cl::Hidden, cl::desc("Count the induction variable only once when interleaving"))
static Type * maybeVectorizeType(Type *Elt, ElementCount VF)
static std::optional< unsigned > getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L, bool CanUseConstantMax=true)
Returns "best known" trip count for the specified loop L as defined by the following procedure: 1) Re...
static cl::opt< TailFoldingStyle > ForceTailFoldingStyle("force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values(clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN(TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), clEnumValN(TailFoldingStyle::DataWithoutLaneMask, "data-without-lane-mask", "Create lane mask with compare/stepvector"), clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", "Create lane mask using active.lane.mask intrinsic, and use " "it for both data and control flow"), clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, "data-and-control-without-rt-check", "Similar to data-and-control, but remove the runtime check"), clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", "Use predicated EVL instructions for tail folding. If EVL " "is unsupported, fallback to data-without-lane-mask.")))
static cl::opt< bool > EnableEpilogueVectorization("enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops."))
static ScalarEpilogueLowering getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI)
const char VerboseDebug[]
static void fixReductionScalarResumeWhenVectorizingEpilog(VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock, BasicBlock *BypassBlock)
static cl::opt< bool > PreferPredicatedReductionSelect("prefer-predicated-reduction-select", cl::init(false), cl::Hidden, cl::desc("Prefer predicating a reduction operation over an after loop select."))
static VPWidenIntOrFpInductionRecipe * createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start, const InductionDescriptor &IndDesc, VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop)
Creates a VPWidenIntOrFpInductionRecpipe for Phi.
static constexpr uint32_t SCEVCheckBypassWeights[]
static cl::opt< bool > PreferInLoopReductions("prefer-inloop-reductions", cl::init(false), cl::Hidden, cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference."))
const char LLVMLoopVectorizeFollowupVectorized[]
static cl::opt< bool > EnableLoadStoreRuntimeInterleave("enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden, cl::desc("Enable runtime interleaving until load/store ports are saturated"))
static cl::opt< bool > VPlanBuildStressTest("vplan-build-stress-test", cl::init(false), cl::Hidden, cl::desc("Build VPlan for every supported loop nest in the function and bail " "out right after the build (stress test the VPlan H-CFG construction " "in the VPlan-native vectorization path)."))
static bool hasIrregularType(Type *Ty, const DataLayout &DL)
A helper function that returns true if the given type is irregular.
static cl::opt< bool > LoopVectorizeWithBlockFrequency("loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden, cl::desc("Enable the use of the block frequency analysis to access PGO " "heuristics minimizing code growth in cold regions and being more " "aggressive in hot regions."))
static Value * getExpandedStep(const InductionDescriptor &ID, const SCEV2ValueTy &ExpandedSCEVs)
Return the expanded step for ID using ExpandedSCEVs to look up SCEV expansion results.
const char LLVMLoopVectorizeFollowupEpilogue[]
static void preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, const SCEV2ValueTy &ExpandedSCEVs, const EpilogueLoopVectorizationInfo &EPI)
Prepare Plan for vectorizing the epilogue loop.
static bool useActiveLaneMask(TailFoldingStyle Style)
static unsigned getEstimatedRuntimeVF(const Loop *L, const TargetTransformInfo &TTI, ElementCount VF)
This function attempts to return a value that represents the vectorization factor at runtime.
static bool isIndvarOverflowCheckKnownFalse(const LoopVectorizationCostModel *Cost, ElementCount VF, std::optional< unsigned > UF=std::nullopt)
For the given VF and UF and maximum trip count computed for the loop, return whether the induction va...
static void addFullyUnrolledInstructionsToIgnore(Loop *L, const LoopVectorizationLegality::InductionList &IL, SmallPtrSetImpl< Instruction * > &InstsToIgnore)
Knowing that loop L executes a single vector iteration, add instructions that will get simplified and...
static cl::opt< PreferPredicateTy::Option > PreferPredicateOverEpilogue("prefer-predicate-over-epilogue", cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden, cl::desc("Tail-folding and predication preferences over creating a scalar " "epilogue loop."), cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue", "Don't tail-predicate loops, create scalar epilogue"), clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue, "predicate-else-scalar-epilogue", "prefer tail-folding, create scalar epilogue if tail " "folding fails."), clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, "predicate-dont-vectorize", "prefers tail-folding, don't attempt vectorization if " "tail-folding fails.")))
static cl::opt< bool > EnableInterleavedMemAccesses("enable-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on interleaved memory accesses in a loop"))
static cl::opt< bool > EnableMaskedInterleavedMemAccesses("enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden, cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"))
An interleave-group may need masking if it resides in a block that needs predication,...
static cl::opt< bool > ForceOrderedReductions("force-ordered-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " "FP reductions"))
static void cse(BasicBlock *BB)
Perform cse of induction variable instructions.
static const SCEV * getAddressAccessSCEV(Value *Ptr, LoopVectorizationLegality *Legal, PredicatedScalarEvolution &PSE, const Loop *TheLoop)
Gets Address Access SCEV after verifying that the access pattern is loop invariant except the inducti...
static cl::opt< cl::boolOrDefault > ForceSafeDivisor("force-widen-divrem-via-safe-divisor", cl::Hidden, cl::desc("Override cost based safe divisor widening for div/rem instructions"))
static cl::opt< unsigned > ForceTargetMaxVectorInterleaveFactor("force-target-max-vector-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "vectorized loops."))
static bool processLoopInVPlanNativePath(Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, LoopVectorizationLegality *LVL, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints, LoopVectorizationRequirements &Requirements)
static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI)
static cl::opt< unsigned > NumberOfStoresToPredicate("vectorize-num-stores-pred", cl::init(1), cl::Hidden, cl::desc("Max number of stores to be predicated behind an if."))
The number of stores in a loop that are allowed to need predication.
static cl::opt< unsigned > MaxNestedScalarReductionIC("max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden, cl::desc("The maximum interleave count to use when interleaving a scalar " "reduction in a nested loop."))
static cl::opt< unsigned > ForceTargetMaxScalarInterleaveFactor("force-target-max-scalar-interleave", cl::init(0), cl::Hidden, cl::desc("A flag that overrides the target's max interleave factor for " "scalar loops."))
static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE)
static void addUsersInExitBlocks(VPlan &Plan, const SetVector< VPIRInstruction * > &ExitUsersToFix)
static bool willGenerateVectors(VPlan &Plan, ElementCount VF, const TargetTransformInfo &TTI)
Check if any recipe of Plan will generate a vector value, which will be assigned a vector register.
static cl::opt< bool > EnableEarlyExitVectorization("enable-early-exit-vectorization", cl::init(false), cl::Hidden, cl::desc("Enable vectorization of early exit loops with uncountable exits."))
static cl::opt< bool > MaximizeBandwidth("vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " "will be determined by the smallest type in loop."))
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop, Instruction *I, DebugLoc DL={})
Create an analysis remark that explains why vectorization failed.
mir Rename Register Operands
This file implements a map that provides insertion order iteration.
std::pair< uint64_t, uint64_t > Interval
This file contains the declarations for metadata subclasses.
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
This file contains the declarations for profiling metadata utility functions.
const SmallVectorImpl< MachineOperand > & Cond
static BinaryOperator * CreateMul(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
static BinaryOperator * CreateAdd(Value *S1, Value *S2, const Twine &Name, BasicBlock::iterator InsertBefore, Value *FlagsOp)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
This pass exposes codegen information to IR-level passes.
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file defines the VPlanHCFGBuilder class which contains the public interface (buildHierarchicalCF...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const char PassName[]
static const uint32_t IV[8]
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
uint64_t getZExtValue() const
Get zero extended value.
int64_t getSExtValue() const
Get sign extended value.
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
void registerAssumption(AssumeInst *CI)
Add an @llvm.assume intrinsic to this function's cache.
unsigned getVScaleRangeMin() const
Returns the minimum value for the vscale_range attribute.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
const Function * getParent() const
Return the enclosing method, or null if none.
LLVMContext & getContext() const
Get the context in which this basic block lives.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
BinaryOps getOpcode() const
Analysis pass which computes BlockFrequencyInfo.
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Represents analyses that only rely on functions' control flow.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_UGT
unsigned greater than
@ ICMP_ULT
unsigned less than
@ ICMP_ULE
unsigned less or equal
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
This is the shared class of boolean and integer constants.
static ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getFalse(LLVMContext &Context)
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
An analysis that produces DemandedBits for a function.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
DomTreeNodeBase * getIDom() const
Analysis pass which computes a DominatorTree.
bool verify(VerificationLevel VL=VerificationLevel::Full) const
verify - checks if the tree is correct.
void changeImmediateDominator(DomTreeNodeBase< NodeT > *N, DomTreeNodeBase< NodeT > *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
DomTreeNodeBase< NodeT > * addNewBlock(NodeT *BB, NodeT *DomBB)
Add a new node to the dominator tree information.
void eraseNode(NodeT *BB)
eraseNode - Removes a node from the dominator tree.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
constexpr bool isVector() const
One or more elements.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
constexpr bool isScalar() const
Exactly one element.
BasicBlock * emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass, BasicBlock *Insert)
Emits an iteration count bypass check after the main vector loop has finished to see if there are any...
void printDebugTracesAtEnd() override
EpilogueVectorizerEpilogueLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the epilogue loop strategy (ie the ...
A specialized derived class of inner loop vectorizer that performs vectorization of main loops in the...
void printDebugTracesAtEnd() override
BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Implements the interface for creating a vectorized skeleton using the main loop strategy (ie the firs...
EpilogueVectorizerMainLoop(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
void printDebugTracesAtStart() override
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
BasicBlock * emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue)
Emits an iteration count bypass check once for the main loop (when ForEpilogue is false) and once for...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent function types.
param_iterator param_begin() const
param_iterator param_end() const
bool hasOptSize() const
Optimize this function for size (-Os) or minimum size (-Oz).
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Represents flags for the getelementptr instruction/expression.
static GEPNoWrapFlags inBounds()
static GEPNoWrapFlags none()
Common base class shared among various IRBuilders.
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
ConstantInt * getFalse()
Get the constant value for i1 false.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
A struct for saving information about induction variables.
const SCEV * getStep() const
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_NoInduction
Not an induction variable.
@ IK_FpInduction
Floating point induction variable.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
const SmallVectorImpl< Instruction * > & getCastInsts() const
Returns a reference to the type cast instructions in the induction update chain, that are redundant w...
Value * getStartValue() const
An extension of the inner loop vectorizer that creates a skeleton for a vectorized loop that has its ...
InnerLoopAndEpilogueVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI, LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
virtual BasicBlock * createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)=0
The interface for creating a vectorized skeleton using one of two different strategies,...
EpilogueLoopVectorizationInfo & EPI
Holds and updates state information required to vectorize the main loop and its epilogue in two separ...
InnerLoopVectorizer vectorizes loops which contain only one basic block to a specified vectorization ...
virtual void printDebugTracesAtStart()
Allow subclasses to override and print debug traces before/after vplan execution, when trace informat...
Value * TripCount
Trip count of the original loop.
void sinkScalarOperands(Instruction *PredInst)
Iteratively sink the scalarized operands of a predicated instruction into the block that was created ...
const TargetLibraryInfo * TLI
Target Library Info.
ElementCount MinProfitableTripCount
const TargetTransformInfo * TTI
Target Transform Info.
Value * VectorTripCount
Trip count of the widened loop (TripCount - TripCount % (VF*UF))
bool areSafetyChecksAdded()
BasicBlock * emitSCEVChecks(BasicBlock *Bypass)
Emit a bypass check to see if all of the SCEV assumptions we've had to make are correct.
virtual BasicBlock * createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs)
Create a new empty loop that will contain vectorized instructions later on, while the old loop will b...
LoopVectorizationCostModel * Cost
The profitablity analysis.
BasicBlock * AdditionalBypassBlock
The additional bypass block which conditionally skips over the epilogue loop after executing the main...
BlockFrequencyInfo * BFI
BFI and PSI are used to check for profile guided size optimizations.
Value * getTripCount() const
Returns the original loop trip count.
BasicBlock * LoopMiddleBlock
Middle Block between the vector and the scalar.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
void scalarizeInstruction(const Instruction *Instr, VPReplicateRecipe *RepRecipe, const VPLane &Lane, VPTransformState &State)
A helper function to scalarize a single Instruction in the innermost loop.
SmallVector< Instruction *, 4 > PredicatedInstructions
Store instructions that were predicated.
DenseMap< PHINode *, Value * > Induction2AdditionalBypassValue
Mapping of induction phis to their additional bypass values.
void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB)
Introduces a new VPIRBasicBlock for CheckIRBB to Plan between the vector preheader and its predecesso...
void createVectorLoopSkeleton(StringRef Prefix)
Emit basic blocks (prefixed with Prefix) for the iteration check, vector loop preheader,...
BasicBlock * emitMemRuntimeChecks(BasicBlock *Bypass)
Emit bypass checks to check any memory assumptions we may have made.
BasicBlock * LoopScalarPreHeader
The scalar-loop preheader.
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount)
Create and record the values for induction variables to resume coming from the additional bypass bloc...
VPBlockBase * VectorPHVPB
The vector preheader block of Plan, used as target for check blocks introduced during skeleton creati...
LoopVectorizationLegality * Legal
The legality analysis.
InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, ElementCount VecWidth, ElementCount MinProfitableTripCount, unsigned UnrollFactor, LoopVectorizationLegality *LVL, LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks, VPlan &Plan)
void emitIterationCountCheck(BasicBlock *Bypass)
Emit a bypass check to see if the vector trip count is zero, including if it overflows.
PredicatedScalarEvolution & PSE
A wrapper around ScalarEvolution used to add runtime SCEV checks.
Value * getInductionAdditionalBypassValue(PHINode *OrigPhi) const
induction header phi.
BasicBlock * getAdditionalBypassBlock() const
Return the additional bypass block which targets the scalar loop by skipping the epilogue loop after ...
DominatorTree * DT
Dominator Tree.
void setTripCount(Value *TC)
Used to set the trip count after ILV's construction and after the preheader block has been executed.
bool OptForSizeBasedOnProfile
void fixVectorizedLoop(VPTransformState &State)
Fix the vectorized code, taking care of header phi's, and more.
BasicBlock * LoopVectorPreHeader
The vector-loop preheader.
virtual void printDebugTracesAtEnd()
AssumptionCache * AC
Assumption Cache.
Value * getOrCreateVectorTripCount(BasicBlock *InsertBlock)
Returns (and creates if needed) the trip count of the widened loop.
IRBuilder Builder
The builder that we use.
void fixNonInductionPHIs(VPTransformState &State)
Fix the non-induction PHIs in Plan.
unsigned UF
The vectorization unroll factor to use.
SmallVector< BasicBlock *, 4 > LoopBypassBlocks
A list of all bypass blocks. The first block is the entry of the loop.
GeneratedRTChecks & RTChecks
Structure to hold information about generated runtime checks, responsible for cleaning the checks,...
virtual ~InnerLoopVectorizer()=default
ElementCount VF
The vectorization SIMD factor to use.
Loop * OrigLoop
The original loop.
static InstructionCost getInvalid(CostType Val=0)
static InstructionCost getMax()
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
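As a minimal sketch (not taken from this file), this is how a caller is expected to inspect an InstructionCost produced by the cost model, assuming the standard llvm::InstructionCost from Support/InstructionCost.h:

#include <optional>
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Print a cost only if it is representable; getValue() is std::nullopt for
// costs built with InstructionCost::getInvalid().
static void printCost(InstructionCost Cost) {
  if (std::optional<InstructionCost::CostType> C = Cost.getValue())
    errs() << "cost: " << *C << "\n";
  else
    errs() << "cost: invalid\n";
}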
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
void replaceSuccessorWith(BasicBlock *OldBB, BasicBlock *NewBB)
Replace specified successor OldBB to point at the provided block.
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
const char * getOpcodeName() const
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
InstTy * getInsertPos() const
uint32_t getNumMembers() const
Drive the analysis of interleaved memory accesses in the loop.
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
bool invalidateGroups()
Invalidate groups, e.g., in case all blocks in loop will be predicated contrary to original assumptio...
iterator_range< SmallPtrSetIterator< llvm::InterleaveGroup< Instruction > * > > getInterleaveGroups()
void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
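A hedged illustration of the interleaving interface above (a sketch, not code from this pass): after analyzeInterleaving() has run, each group can be walked member by member, where a null member marks a gap.

#include <cstdint>
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

// Dump every member of each interleave group; getMember(I) returns nullptr for
// a gap, which is why getNumMembers() can be smaller than getFactor().
static void dumpInterleaveGroups(InterleavedAccessInfo &IAI) {
  for (InterleaveGroup<Instruction> *Group : IAI.getInterleaveGroups()) {
    errs() << "group with factor " << Group->getFactor() << ":\n";
    for (uint32_t I = 0, F = Group->getFactor(); I != F; ++I)
      if (Instruction *Member = Group->getMember(I)) {
        Member->print(errs());
        errs() << "\n";
      }
  }
}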
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Type * getPointerOperandType() const
This analysis provides dependence information for the memory accesses of a loop.
Drive the analysis of memory accesses in the loop.
const RuntimePointerChecking * getRuntimePointerChecking() const
unsigned getNumRuntimePointerChecks() const
Number of memchecks required to prove independence of otherwise may-alias pointers.
const DenseMap< Value *, const SCEV * > & getSymbolicStrides() const
If an access has a symbolic strides, this maps the pointer value to the stride symbol.
Analysis pass that exposes the LoopInfo for a function.
bool contains(const LoopT *L) const
Return true if the specified loop is contained within this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void getExitBlocks(SmallVectorImpl< BlockT * > &ExitBlocks) const
Return all of the successor blocks of this loop.
BlockT * getUniqueLatchExitBlock() const
Return the unique exit block for the latch, or null if there are multiple different exit blocks or th...
void getExitingBlocks(SmallVectorImpl< BlockT * > &ExitingBlocks) const
Return all blocks inside the loop that have successors outside of the loop.
BlockT * getHeader() const
unsigned getLoopDepth() const
Return the nesting level of this loop.
void addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase< BlockT, LoopT > &LI)
This method is used by other analyses to update loop information.
iterator_range< block_iterator > blocks() const
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
bool isLoopExiting(const BlockT *BB) const
True if the terminator of the block can branch to another block that is outside of the current loop.
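As a small sketch using only the Loop API listed above, a simplified version of the shape test performed before any deeper analysis:

#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;

// An innermost loop with a preheader, a single latch and a single exiting
// block is the easy case; anything else needs extra handling or is rejected.
static bool hasSimpleVectorizableShape(const Loop &L) {
  return L.isInnermost() && L.getLoopPreheader() && L.getLoopLatch() &&
         L.getExitingBlock();
}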
Store the result of a depth first search within basic blocks contained by a single loop.
RPOIterator beginRPO() const
Reverse iterate over the cached postorder blocks.
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
RPOIterator endRPO() const
Wrapper class to LoopBlocksDFS that provides a standard begin()/end() interface for the DFS reverse p...
void perform(const LoopInfo *LI)
Traverse the loop blocks and store the DFS result.
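A minimal sketch of how these helpers are typically combined (not lifted from this file): run the DFS once, then visit the loop body in reverse post-order so definitions are seen before their uses.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
using namespace llvm;

static void forEachBlockInRPO(Loop *L, LoopInfo *LI,
                              function_ref<void(BasicBlock *)> Visit) {
  LoopBlocksDFS DFS(L);
  DFS.perform(LI); // cache the traversal once
  for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
    Visit(BB);
}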
void removeBlock(BlockT *BB)
This method completely removes BB from all data structures, including all of the Loop objects it is n...
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
LoopVectorizationCostModel - estimates the expected speedups due to vectorization.
SmallPtrSet< Type *, 16 > ElementTypesInLoop
All element types found in the loop.
void collectElementTypesForWidening()
Collect all element types in the loop for which widening is needed.
bool canVectorizeReductions(ElementCount VF) const
Returns true if the target machine supports all of the reduction variables found for the given VF.
bool isEpilogueVectorizationProfitable(const ElementCount VF, const unsigned IC) const
Returns true if epilogue vectorization is considered profitable, and false otherwise.
bool requiresScalarEpilogue(VFRange Range) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
bool isPredicatedInst(Instruction *I) const
Returns true if I is an instruction that needs to be predicated at runtime.
bool hasPredStores() const
void collectValuesToIgnore()
Collect values we want to ignore in the cost model.
void collectInLoopReductions()
Split reductions into those that happen in the loop, and those that happen outside.
std::pair< unsigned, unsigned > getSmallestAndWidestTypes()
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be uniform after vectorization.
bool usePredicatedReductionSelect(unsigned Opcode, Type *PhiTy) const
Returns true if the predicated reduction select should be used to set the incoming value for the redu...
PredicatedScalarEvolution & PSE
Predicated scalar evolution analysis.
const LoopVectorizeHints * Hints
Loop Vectorize Hint.
std::optional< unsigned > getMaxSafeElements() const
Return maximum safe number of elements to be processed per vector iteration, which do not prevent sto...
const TargetTransformInfo & TTI
Vector target information.
LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI)
const Function * TheFunction
LoopVectorizationLegality * Legal
Vectorization legality.
bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked load operation for the given DataType and kind of ...
std::optional< InstructionCost > getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy) const
Return the cost of instructions in an inloop reduction pattern, if I is part of that pattern.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF)
Returns the execution time cost of an instruction for a given vector width.
DemandedBits * DB
Demanded bits analysis.
bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const
Returns true if I is a memory instruction in an interleaved-group of memory accesses that can be vect...
const TargetLibraryInfo * TLI
Target Library Info.
bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF)
Returns true if I is a memory instruction with consecutive memory access that can be widened.
const InterleaveGroup< Instruction > * getInterleavedAccessGroup(Instruction *Instr) const
Get the interleaved access group that Instr belongs to.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const
Estimate cost of an intrinsic call instruction CI if it were vectorized with factor VF.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const
Returns true if I is known to be scalar after vectorization.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF)
Return True if instruction I is an optimizable truncate whose operand is an induction variable.
FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC)
Loop * TheLoop
The loop that we evaluate.
TTI::TargetCostKind CostKind
The kind of cost that we are calculating.
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Returns the TailFoldingStyle that is best for the current loop.
InterleavedAccessInfo & InterleaveInfo
The interleave access information contains groups of interleaved accesses with the same stride and cl...
SmallPtrSet< const Value *, 16 > ValuesToIgnore
Values to ignore in the cost model.
void setVectorizedCallDecision(ElementCount VF)
A call may be vectorized in different ways depending on whether we have vectorized variants available...
void invalidateCostModelingDecisions()
Invalidates decisions already taken by the cost model.
bool isAccessInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleaved access group.
bool selectUserVectorizationFactor(ElementCount UserVF)
Setup cost-based decisions for user vectorization factor.
OptimizationRemarkEmitter * ORE
Interface to emit optimization remarks.
bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const
Returns true if the target machine supports masked store operation for the given DataType and kind of...
LoopInfo * LI
Loop Info analysis.
bool requiresScalarEpilogue(bool IsVectorizing) const
Returns true if we're required to use a scalar epilogue for at least the final iteration of the origi...
SmallVector< RegisterUsage, 8 > calculateRegisterUsage(ArrayRef< ElementCount > VFs)
SmallPtrSet< const Value *, 16 > VecValuesToIgnore
Values to ignore in the cost model when VF > 1.
bool isInLoopReduction(PHINode *Phi) const
Returns true if the Phi is part of an inloop reduction.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const
void setWideningDecision(const InterleaveGroup< Instruction > *Grp, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for interleaving group Grp and vector ...
const MapVector< Instruction *, uint64_t > & getMinimalBitwidths() const
CallWideningDecision getCallWideningDecision(CallInst *CI, ElementCount VF) const
bool isLegalGatherOrScatter(Value *V, ElementCount VF)
Returns true if the target machine can represent V as a masked gather or scatter operation.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const
bool runtimeChecksRequired()
bool shouldConsiderInvariant(Value *Op)
Returns true if Op should be considered invariant and if it is trivially hoistable.
bool foldTailByMasking() const
Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailWithEVL() const
Returns true if VP intrinsics with explicit vector length support should be generated in the tail fol...
void collectUniformsAndScalars(ElementCount VF)
Collect Uniform and Scalar values for the given VF.
bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const
Returns true if the instructions in this block requires predication for any reason,...
void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind, Function *Variant, Intrinsic::ID IID, std::optional< unsigned > MaskPos, InstructionCost Cost)
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC)
Selects and saves TailFoldingStyle for 2 options - if IV update may overflow or not.
AssumptionCache * AC
Assumption cache.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, InstructionCost Cost)
Save vectorization decision W and Cost taken by the cost model for instruction I and vector width VF.
InstWidening
Decision that was taken during cost calculation for memory instruction.
bool isScalarWithPredication(Instruction *I, ElementCount VF) const
Returns true if I is an instruction which requires predication and for which our chosen predication s...
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const
Estimate cost of a call instruction CI if it were vectorized with factor VF.
bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const
Returns true if we should use strict in-order reductions for the given RdxDesc.
std::pair< InstructionCost, InstructionCost > getDivRemSpeculationCost(Instruction *I, ElementCount VF) const
Return the costs for our two available strategies for lowering a div/rem operation which requires spe...
bool isDivRemScalarWithPredication(InstructionCost ScalarCost, InstructionCost SafeDivisorCost) const
Given costs for both strategies, return true if the scalar predication lowering should be used for di...
InstructionCost expectedCost(ElementCount VF)
Returns the expected execution cost.
void setCostBasedWideningDecision(ElementCount VF)
A memory access instruction may be vectorized in more than one way.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) const
Return the cost model decision for the given instruction I and vector width VF.
bool isScalarEpilogueAllowed() const
Returns true if a scalar epilogue is not allowed due to optsize or a loop hint annotation.
InstructionCost getWideningCost(Instruction *I, ElementCount VF)
Return the vectorization cost for the given instruction I and vector width VF.
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost)
void collectInstsToScalarize(ElementCount VF)
Collects the instructions to scalarize for each predicated instruction in the loop.
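To make the decision caching above concrete, here is an illustrative (not authoritative) sketch of the query pattern around the widening-decision APIs; CM_Unknown is assumed to be the cost model's "no decision recorded yet" enumerator.

// Sketch only: assumes a LoopVectorizationCostModel &CM from this pass.
static InstructionCost memoryOpCost(LoopVectorizationCostModel &CM,
                                    Instruction *I, ElementCount VF) {
  if (CM.getWideningDecision(I, VF) == LoopVectorizationCostModel::CM_Unknown)
    CM.setCostBasedWideningDecision(VF); // records decisions for memory ops at VF
  return CM.getWideningCost(I, VF);
}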
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
unsigned getNumStores() const
bool isInvariantStoreOfReduction(StoreInst *SI)
Returns True if given store is a final invariant store of one of the reductions found in the loop.
bool hasVectorCallVariants() const
Returns true if there is at least one function call in the loop which has a vectorized variant availa...
uint64_t getMaxSafeVectorWidthInBits() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
bool isInvariantAddressOfReduction(Value *V)
Returns True if given address is invariant and is used to store recurrent expression.
bool blockNeedsPredication(BasicBlock *BB) const
Return true if the block BB needs to be predicated in order for the loop to be vectorized.
bool canVectorize(bool UseVPlanNativePath)
Returns true if it is legal to vectorize this loop.
int isConsecutivePtr(Type *AccessTy, Value *Ptr) const
Check if this pointer is consecutive when vectorizing.
std::optional< const HistogramInfo * > getHistogramInfo(Instruction *I) const
Returns a HistogramInfo* for the given instruction if it was determined to be part of a load -> updat...
bool canVectorizeFPMath(bool EnableStrictReductions)
Returns true if it is legal to vectorize the FP math operations in this loop.
bool isReductionVariable(PHINode *PN) const
Returns True if PN is a reduction variable in this loop.
bool isFixedOrderRecurrence(const PHINode *Phi) const
Returns True if Phi is a fixed-order recurrence in this loop.
const InductionDescriptor * getPointerInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is pointer induction.
const InductionDescriptor * getIntOrFpInductionDescriptor(PHINode *Phi) const
Returns a pointer to the induction descriptor, if Phi is an integer or floating point induction.
bool isInductionPhi(const Value *V) const
Returns True if V is a Phi node of an induction variable in this loop.
PHINode * getPrimaryInduction()
Returns the primary induction variable.
const SmallVector< BasicBlock *, 4 > & getCountableExitingBlocks() const
Returns all exiting blocks with a countable exit, i.e.
const InductionList & getInductionVars() const
Returns the induction variables found in the loop.
bool hasStructVectorCall() const
Returns true if there is at least one function call in the loop which returns a struct type and needs...
bool isInvariant(Value *V) const
Returns true if V is invariant across all loop iterations according to SCEV.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
bool isSafeForAnyVectorWidth() const
unsigned getNumLoads() const
bool canFoldTailByMasking() const
Return true if we can vectorize this loop while folding its tail by masking.
void prepareToFoldTailByMasking()
Mark all respective loads/stores for masking.
Type * getWidestInductionType()
Returns the widest induction type.
bool hasUncountableEarlyExit() const
Returns true if the loop has exactly one uncountable early exit, i.e.
bool hasHistograms() const
Returns true if any known histogram operations were found in the loop.
const LoopAccessInfo * getLAI() const
bool isUniformMemOp(Instruction &I, ElementCount VF) const
A uniform memory op is a load or store which accesses the same memory location on all VF lanes,...
BasicBlock * getUncountableEarlyExitingBlock() const
Returns the uncountable early exiting block, if there is exactly one.
bool isMaskRequired(const Instruction *I) const
Returns true if the vector representation of the instruction I requires a mask.
const RuntimePointerChecking * getRuntimePointerChecking() const
Returns the information that we collected about runtime memory check.
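A hedged usage sketch of the legality interface (names and flow simplified relative to the real driver):

#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
using namespace llvm;

// Summarize what Legality found; only meaningful after canVectorize() succeeds.
static void summarizeLegality(LoopVectorizationLegality &LVL) {
  if (!LVL.canVectorize(/*UseVPlanNativePath=*/false))
    return;
  errs() << "inductions: " << LVL.getInductionVars().size() << "\n"
         << "reductions: " << LVL.getReductionVars().size() << "\n";
  if (PHINode *Primary = LVL.getPrimaryInduction())
    errs() << "primary IV: " << Primary->getName() << "\n";
}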
Planner drives the vectorization process after having passed Legality checks.
VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, unsigned IC)
VPlan & getPlanFor(ElementCount VF) const
Return the VPlan for VF.
VectorizationFactor planInVPlanNativePath(ElementCount UserVF)
Use the VPlan-native path to plan how to best vectorize, return the best VF and its cost.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF)
Build VPlans for power-of-2 VF's between MinVF and MaxVF inclusive, according to the information gath...
VectorizationFactor computeBestVF()
Compute and return the most profitable vectorization factor.
void emitInvalidCostRemarks(OptimizationRemarkEmitter *ORE)
Emit remarks for recipes with invalid costs in the available VPlans.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
void printPlans(raw_ostream &O)
void plan(ElementCount UserVF, unsigned UserIC)
Build VPlans for the specified UserVF and UserIC if they are non-zero or all applicable candidate VFs...
DenseMap< const SCEV *, Value * > executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan, InnerLoopVectorizer &LB, DominatorTree *DT, bool VectorizingEpilogue, const DenseMap< const SCEV *, Value * > *ExpandedSCEVs=nullptr)
Generate the IR code for the vectorized loop captured in VPlan BestPlan according to the best selecte...
bool hasPlanWithVF(ElementCount VF) const
Look through the existing plans and return true if we have one with vectorization factor VF.
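Condensed into a sketch, the planner's external contract looks roughly like this; UF is normally chosen via selectInterleaveCount but is passed in here, and all error handling is omitted.

// Sketch of the plan -> pick -> execute flow; LVP, ILV and DT are assumed to
// have been set up by the caller exactly as the pass does.
static void vectorizeWithBestPlan(LoopVectorizationPlanner &LVP,
                                  InnerLoopVectorizer &ILV, DominatorTree *DT,
                                  ElementCount UserVF, unsigned UserIC,
                                  unsigned UF) {
  LVP.plan(UserVF, UserIC);                         // build candidate VPlans
  VectorizationFactor BestVF = LVP.computeBestVF(); // most profitable factor
  VPlan &BestPlan = LVP.getPlanFor(BestVF.Width);
  LVP.executePlan(BestVF.Width, UF, BestPlan, ILV, DT,
                  /*VectorizingEpilogue=*/false);
}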
This holds vectorization requirements that must be verified late in the process.
Instruction * getExactFPInst()
Utility class for getting and setting loop vectorizer hints in the form of loop metadata.
bool isScalableVectorizationDisabled() const
enum ForceKind getForce() const
bool allowVectorization(Function *F, Loop *L, bool VectorizeOnlyWhenForced) const
bool allowReordering() const
When loop hints that enable vectorization are provided, we allow the vectorizer to change the order of operations that ...
void emitRemarkWithHints() const
Dumps all the hint information.
bool isPotentiallyUnsafe() const
ElementCount getWidth() const
@ FK_Enabled
Forcing enabled.
@ FK_Undefined
Not selected.
@ FK_Disabled
Forcing disabled.
unsigned getPredicate() const
void setAlreadyVectorized()
Mark the loop L as already vectorized by setting the width to 1.
const char * vectorizeAnalysisPassName() const
If hints are provided that force vectorization, use the AlwaysPrint pass name to force the frontend t...
unsigned getInterleave() const
void prepareNoAliasMetadata()
Set up the aliasing scopes based on the memchecks.
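A small sketch (assumptions noted in the comments) of how the hint queries above gate the pass:

// Sketch: decide whether to even try vectorizing, based only on metadata hints.
static bool hintsAllowVectorization(LoopVectorizeHints &Hints, Function *F,
                                    Loop *L, bool VectorizeOnlyWhenForced) {
  if (Hints.getForce() == LoopVectorizeHints::FK_Disabled)
    return false; // explicitly disabled via loop metadata
  return Hints.allowVectorization(F, L, VectorizeOnlyWhenForced);
}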
Represents a single loop in the control flow graph.
bool hasLoopInvariantOperands(const Instruction *I) const
Return true if all the operands of the specified instruction are loop invariant.
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
MDNode * getLoopID() const
Return the llvm.loop loop id metadata node for this loop if it is present.
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
const MDOperand & getOperand(unsigned I) const
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
unsigned getNumOperands() const
Return number of MDNode operands.
static MDString * get(LLVMContext &Context, StringRef Str)
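For context, a hedged sketch of how MDString/MDNode are combined into the kind of loop metadata property that setAlreadyVectorized() attaches (the real method also carries over the existing loop ID operands):

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Build the {"llvm.loop.isvectorized", i32 1} property node.
static MDNode *makeIsVectorizedProperty(LLVMContext &Ctx) {
  Metadata *Ops[] = {
      MDString::get(Ctx, "llvm.loop.isvectorized"),
      ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(Ctx, 32), 1))};
  return MDNode::get(Ctx, Ops);
}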
This class implements a map that also provides access to all stored values in a deterministic order.
iterator find(const KeyT &Key)
bool contains(const KeyT &Key) const
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Diagnostic information for optimization analysis remarks related to pointer aliasing.
Diagnostic information for optimization analysis remarks related to floating-point non-commutativity.
Diagnostic information for optimization analysis remarks.
The optimization diagnostic interface.
bool allowExtraAnalysis(StringRef PassName) const
Whether we allow for extra compile-time budget to perform more analysis to produce fewer false positi...
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
An analysis over an "inner" IR unit that provides access to an analysis manager over an "outer" IR unit...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
static unsigned getIncomingValueNumForOperand(unsigned i)
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
const SCEVPredicate & getPredicate() const
unsigned getSmallConstantMaxTripCount()
Returns the upper bound of the loop trip count as a normal unsigned value, or 0 if the trip count is ...
const SCEV * getBackedgeTakenCount()
Get the (predicated) backedge count for the analyzed loop.
const SCEV * getSymbolicMaxBackedgeTakenCount()
Get the (predicated) symbolic max backedge count for the analyzed loop.
const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
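A minimal sketch of the query pattern: the expressions PSE returns are only valid under its accumulated predicate, which the user must verify or emit as a runtime check.

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// Returns the backedge-taken count if SCEV can compute one under the current
// predicate, or nullptr otherwise.
static const SCEV *backedgeTakenCountOrNull(PredicatedScalarEvolution &PSE) {
  const SCEV *BTC = PSE.getBackedgeTakenCount();
  return isa<SCEVCouldNotCompute>(BTC) ? nullptr : BTC;
}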
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
void preserve()
Mark an analysis as preserved.
An analysis pass based on the new PM to deliver ProfileSummaryInfo.
Analysis providing profile information.
bool hasProfileSummary() const
Returns true if profile summary is available.
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
static bool isFMulAddIntrinsic(Instruction *I)
Returns true if the instruction is a call to the llvm.fmuladd intrinsic.
FastMathFlags getFastMathFlags() const
Instruction * getLoopExitInstr() const
static unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
Type * getRecurrenceType() const
Returns the type of the recurrence.
const SmallPtrSet< Instruction *, 8 > & getCastInsts() const
Returns a reference to the instructions used for type-promoting the recurrence.
unsigned getMinWidthCastToRecurrenceTypeInBits() const
Returns the minimum width used by the recurrence in bits.
TrackingVH< Value > getRecurrenceStartValue() const
SmallVector< Instruction *, 4 > getReductionOpChain(PHINode *Phi, Loop *L) const
Attempts to find a chain of operations from Phi to LoopExitInst that can be treated as a set of reduc...
static bool isAnyOfRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
static bool isFindLastIVRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is of the form select(cmp(),x,y) where one of (x,...
bool isSigned() const
Returns true if all source operands of the recurrence are SExtInsts.
RecurKind getRecurrenceKind() const
bool isOrdered() const
Expose an ordered FP reduction to the instance users.
Value * getSentinelValue() const
Returns the sentinel value for FindLastIV recurrences to replace the start value.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
bool Need
This flag indicates if we need to add the runtime check.
std::optional< ArrayRef< PointerDiffInfo > > getDiffChecks() const
const SmallVectorImpl< RuntimePointerCheck > & getChecks() const
Returns the checks that generateChecks created.
This class represents a constant integer value.
const APInt & getAPInt() const
Helper to remove instructions inserted during SCEV expansion, unless they are marked as used.
This class uses information about analyze scalars to rewrite expressions in canonical form.
ScalarEvolution * getSE()
bool isInsertedInstruction(Instruction *I) const
Return true if the specified instruction was inserted by the code rewriter.
Value * expandCodeForPredicate(const SCEVPredicate *Pred, Instruction *Loc)
Generates a code sequence that evaluates this predicate.
This class represents an assumption made using SCEV expressions which can be checked at run-time.
virtual bool isAlwaysTrue() const =0
Returns true if the predicate is always true.
This class represents an analyzed expression in the program.
bool isOne() const
Return true if the expression is a constant one.
bool isZero() const
Return true if the expression is a constant zero.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getURemExpr(const SCEV *LHS, const SCEV *RHS)
Represents an unsigned remainder expression based on unsigned division.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
void forgetBlockAndLoopDispositions(Value *V=nullptr)
Called when the client has changed the disposition of values in a loop or block.
const SCEV * getMinusOne(Type *Ty)
Return a SCEV for the constant -1 of a specific type.
void forgetLcssaPhiWithNewPredecessor(Loop *L, PHINode *V)
Forget LCSSA phi node V of loop L to which a new predecessor was added, such that it may no longer be...
unsigned getSmallConstantTripCount(const Loop *L)
Returns the exact trip count of the loop if we can compute it, and the result is a small constant.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
const SCEV * applyLoopGuards(const SCEV *Expr, const Loop *L)
Try to apply information from loop guards for L to Expr.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
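A hedged example of the trip-count query used in profitability heuristics such as the small-trip-count threshold:

#include "llvm/Analysis/ScalarEvolution.h"
using namespace llvm;

// getSmallConstantTripCount() returns 0 when the trip count is unknown or not
// a small constant, so 0 must be treated as "no information".
static bool hasKnownTripCountBelow(ScalarEvolution &SE, const Loop *L,
                                   unsigned Threshold) {
  unsigned TC = SE.getSmallConstantTripCount(L);
  return TC != 0 && TC < Threshold;
}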
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
value_type pop_back_val()
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
std::optional< unsigned > getVScaleForTuning() const
bool isLegalMaskedScatter(Type *DataType, Align Alignment) const
Return true if the target supports masked scatter.
bool enableAggressiveInterleaving(bool LoopHasReductions) const
Don't restrict interleaved unrolling to small loops.
bool preferInLoopReduction(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
bool supportsEfficientVectorElementLoadStore() const
If target has efficient vector element load/store instructions, it can return true here so that inser...
bool prefersVectorizedAddressing() const
Return true if target doesn't mind addresses in vectors.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
bool hasBranchDivergence(const Function *F=nullptr) const
Return true if branch divergence exists.
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE=nullptr, const SCEV *Ptr=nullptr) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &, UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const
Get target-customized preferences for the generic loop unrolling transformation.
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction's unique non-constant operands.
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
std::optional< unsigned > getMaxVScale() const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
bool enableOrderedReductions() const
Return true if we should be enabling ordered reductions for the target.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
Query the target what the preferred style of tail folding is.
unsigned getRegUsageForType(Type *Ty) const
Returns the estimated number of registers required to represent Ty.
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of an Add ...
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool isElementTypeLegalForScalableVector(Type *Ty) const
ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const
bool enableMaskedInterleavedAccessVectorization() const
Enable matching of interleaved access groups that contain predicated accesses or gaps and therefore v...
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
bool isVScaleKnownToBeAPowerOfTwo() const
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
bool preferPredicatedReductionSelect(unsigned Opcode, Type *Ty, ReductionFlags Flags) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
const char * getRegisterClassName(unsigned ClassID) const
bool preferEpilogueVectorization() const
Return true if the loop vectorizer should consider vectorizing an otherwise scalar epilogue loop.
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
bool hasActiveVectorLength(unsigned Opcode, Type *DataType, Align Alignment) const
unsigned getEpilogueVectorizationMinVF() const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool supportsScalableVectors() const
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
bool isLegalMaskedStore(Type *DataType, Align Alignment) const
Return true if the target supports masked store.
PartialReductionExtendKind
@ TCC_Free
Expected to fold away in lowering.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
bool enableInterleavedAccessVectorization() const
Enable matching of interleaved access groups.
unsigned getMinTripCountTailFoldingThreshold() const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
unsigned getMaxInterleaveFactor(ElementCount VF) const
bool enableScalableVectorization() const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, PartialReductionExtendKind OpAExtend, PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp=std::nullopt) const
bool isTruncateFree(Type *Ty1, Type *Ty2) const
Return true if it's free to truncate a value of type Ty1 to type Ty2.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
Query the target whether it would be preferred to create a predicated vector loop, which can avoid the...
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ Interleave
The cast is used with an interleaved load/store.
@ GatherScatter
The cast is used with a gather/scatter.
bool preferFixedOverScalableIfEqualCost() const
bool isLegalMaskedLoad(Type *DataType, Align Alignment) const
Return true if the target supports masked load.
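Illustration only (the real cost model adds operand info, context instructions and many special cases): the basic shape of a TTI throughput query for a widened binary operation at a candidate VF.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static InstructionCost costOfWideAdd(const TargetTransformInfo &TTI,
                                     Type *ElementTy, ElementCount VF) {
  auto *WideTy = VectorType::get(ElementTy, VF);
  return TTI.getArithmeticInstrCost(Instruction::Add, WideTy,
                                    TargetTransformInfo::TCK_RecipThroughput);
}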
This class represents a truncation of integer types.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntOrPtrTy() const
Return true if this is an integer type or a pointer type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isTokenTy() const
Return true if this is 'token'.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
This function has undefined behavior.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
RecipeListTy::iterator iterator
Instruction iterators...
void execute(VPTransformState *State) override
The method which generates the output IR instructions that correspond to this VPBasicBlock,...
iterator begin()
Recipe iterator methods.
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
void insert(VPRecipeBase *Recipe, iterator InsertPt)
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
VPRegionBlock * getParent()
const VPBasicBlock * getExitingBasicBlock() const
void setName(const Twine &newName)
size_t getNumSuccessors() const
void swapSuccessors()
Swap successors of the block. The block must have exactly 2 successors.
VPBlockBase * getSinglePredecessor() const
const VPBasicBlock * getEntryBasicBlock() const
VPBlockBase * getSingleSuccessor() const
const VPBlocksTy & getSuccessors() const
static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBase NewBlock after BlockPtr.
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
static void reassociateBlocks(VPBlockBase *Old, VPBlockBase *New)
Reassociate all the blocks connected to Old so that they now point to New.
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, DebugLoc DL={}, const Twine &Name="")
Create a new ICmp VPInstruction with predicate Pred and operands A and B.
VPValue * createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPBasicBlock * getInsertBlock() const
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
VPScalarCastRecipe * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL)
VPInstruction * createOverflowingOp(unsigned Opcode, std::initializer_list< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, DebugLoc DL={}, const Twine &Name="")
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
VPValue * createNot(VPValue *Operand, DebugLoc DL={}, const Twine &Name="")
VPValue * createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL={}, const Twine &Name="")
VPValue * createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, DebugLoc DL={}, const Twine &Name="", std::optional< FastMathFlags > FMFs=std::nullopt)
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
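A hedged sketch against the builder interface listed above (VPBuilder is internal to the vectorizer; the helper name here is illustrative):

// Emit select(!Mask, A, B) at the end of VPBB using the VPBuilder API.
static VPValue *emitInvertedSelect(VPBuilder &Builder, VPBasicBlock *VPBB,
                                   VPValue *Mask, VPValue *A, VPValue *B) {
  Builder.setInsertPoint(VPBB);
  VPValue *NotMask = Builder.createNot(Mask);
  return Builder.createSelect(NotMask, A, B);
}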
Canonical scalar induction phi of the vector loop.
Type * getScalarType() const
Returns the scalar type of the induction.
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
void execute(VPTransformState &State) override
Generate the transformed value of the induction at offset StartValue (1.
VPValue * getStepValue() const
VPValue * getStartValue() const
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
A recipe representing a sequence of load -> update -> store as part of a histogram operation.
A special type of VPBasicBlock that wraps an existing IR basic block.
A recipe to wrap an original IR instruction not to be modified during execution, except for PHIs.
This is a concrete Recipe that models a single VPlan-level instruction.
@ ResumePhi
Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
In what follows, the term "input IR" refers to code that is fed into the vectorizer whereas the term ...
static VPLane getLastLaneForVF(const ElementCount &VF)
static VPLane getFirstLane()
A recipe for forming partial reductions.
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
VPBasicBlock * getParent()
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * tryToCreatePartialReduction(Instruction *Reduction, ArrayRef< VPValue * > Operands)
Create and return a partial reduction recipe for a reduction instruction along with binary operation ...
VPValue * createEdgeMask(BasicBlock *Src, BasicBlock *Dst)
A helper function that computes the predicate of the edge between SRC and DST.
VPReplicateRecipe * handleReplication(Instruction *I, VFRange &Range)
Build a VPReplicateRecipe for I.
void createSwitchEdgeMasks(SwitchInst *SI)
Create an edge mask for every destination of cases and/or default.
VPValue * getBlockInMask(BasicBlock *BB) const
Returns the entry mask for the block BB.
VPValue * getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const
A helper that returns the previously computed predicate of the edge between SRC and DST.
iterator_range< mapped_iterator< Use *, std::function< VPValue *(Value *)> > > mapToVPValues(User::op_range Operands)
Returns a range mapping the values of the range Operands to their corresponding VPValues.
void fixHeaderPhis()
Add the incoming values from the backedge to reduction & first-order recurrence cross-iteration phis.
VPRecipeBase * tryToCreateWidenRecipe(Instruction *Instr, ArrayRef< VPValue * > Operands, VFRange &Range, VPBasicBlock *VPBB)
Create and return a widened recipe for I if one can be created within the given VF Range.
VPValue * getVPValueOrAddLiveIn(Value *V)
void createHeaderMask()
Create the mask for the vector loop header block.
std::optional< unsigned > getScalingForReduction(const Instruction *ExitInst)
void createBlockInMask(BasicBlock *BB)
A helper function that computes the predicate of the block BB, assuming that the header block of the ...
void collectScaledReductions(VFRange &Range)
Find all possible partial reductions in the loop and track all of those that are valid so recipes can...
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
void setFlags(Instruction *I) const
Set the IR flags for I.
A recipe for handling reduction phis.
bool isInLoop() const
Returns true, if the phi is part of an in-loop reduction.
const RecurrenceDescriptor & getRecurrenceDescriptor() const
A recipe to represent inloop reduction operations, performing a reduction on a vector operand into a ...
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
const VPBlockBase * getEntry() const
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
void execute(VPTransformState &State) override
Generate replicas of the desired Ingredient.
bool shouldPack() const
Returns true if the recipe is used by a widened recipe via an intervening VPPredInstPHIRecipe.
A recipe to compute the pointers for widened memory accesses of IndexTy in reverse order.
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
An analysis for type-inference for VPValues.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
void setOperand(unsigned I, VPValue *New)
unsigned getNumOperands() const
VPValue * getOperand(unsigned N) const
void addOperand(VPValue *Operand)
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
void replaceAllUsesWith(VPValue *New)
Value * getLiveInIRValue()
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
A recipe to compute the pointers for widened memory accesses of IndexTy.
A recipe for widening Call instructions using library calls.
A Recipe for widening the canonical induction variable of the vector loop.
VPWidenCastRecipe is a recipe to create vector cast instructions.
A recipe for handling GEP instructions.
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
VPValue * getStepValue()
Returns the step value of the induction.
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
A recipe for widening vector intrinsics.
A common base class for widening memory operations.
A recipe for handling phis that are widened in the vector loop.
VPValue * getIncomingValue(unsigned I)
Returns the I th incoming VPValue.
VPBasicBlock * getIncomingBlock(unsigned I)
Returns the I th incoming VPBasicBlock.
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Main class to build the VPlan H-CFG for an incoming IR.
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient output...
void prepareToExecute(Value *TripCount, Value *VectorTripCount, VPTransformState &State)
Prepare the plan for execution, setting up the required live-in values.
VPBasicBlock * getEntry()
VPValue & getVectorTripCount()
The vector trip count.
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
VPValue & getVF()
Returns the VF of the vector loop region.
VPValue * getTripCount() const
The trip count of the original loop.
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
iterator_range< SmallSetVector< ElementCount, 2 >::iterator > vectorFactors() const
Returns an iterator range over all VFs of the plan.
static VPlanPtr createInitialVPlan(Type *InductionTy, PredicatedScalarEvolution &PSE, bool RequiresScalarEpilogueCheck, bool TailFolded, Loop *TheLoop)
Create initial VPlan, having an "entry" VPBasicBlock (wrapping original scalar pre-header) which cont...
bool hasVF(ElementCount VF)
bool hasUF(unsigned UF) const
auto getExitBlocks()
Return an iterator range over the VPIRBasicBlock wrapping the exit blocks of the VPlan,...
VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
InstructionCost cost(ElementCount VF, VPCostContext &Ctx)
Return the cost of this plan.
const VPBasicBlock * getMiddleBlock() const
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
void setEntry(VPBasicBlock *VPBB)
VPIRBasicBlock * createVPIRBasicBlock(BasicBlock *IRBB)
Create a VPIRBasicBlock from IRBB containing VPIRInstructions for all instructions in IRBB,...
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
void execute(VPTransformState *State)
Generate the IR code for this VPlan.
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the vector loop.
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
VPlan * duplicate()
Clone the current VPlan, update all VPValues of the new VPlan and cloned recipes to refer to the clon...
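A small sketch of how candidate plans are filtered by VF, using only hasVF() from the interface above; the container is assumed to be the VPlanPtr list the planner keeps.

// Return the first plan that covers VF, or nullptr if none of them do.
static VPlan *findPlanWithVF(ArrayRef<VPlanPtr> Plans, ElementCount VF) {
  for (const VPlanPtr &P : Plans)
    if (P->hasVF(VF))
      return P.get();
  return nullptr;
}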
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUser() const
Return true if there is exactly one user of this value.
void setName(const Twine &Name)
Change the name of the value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
LLVMContext & getContext() const
All values hold a context through their type.
StringRef getName() const
Return a constant reference to the value's name.
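A minimal sketch of how the Value interface above is typically used, via a hypothetical helper that forwards uses from one value to another (the helper name is illustrative, not taken from this file):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"

// Hypothetical helper: give NewV the old value's name, then forward its uses.
static void forwardUses(llvm::Value *OldV, llvm::Value *NewV) {
  if (OldV->getType() != NewV->getType())      // RAUW requires matching types
    return;
  NewV->setName(OldV->getName());
  if (OldV->hasOneUser())
    OldV->replaceAllUsesWith(NewV);            // single user: plain RAUW
  else
    OldV->replaceUsesWithIf(NewV, [](llvm::Use &U) {
      // Predicate form: only rewrite uses that sit in instructions.
      return llvm::isa<llvm::Instruction>(U.getUser());
    });
}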
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
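A short sketch of VectorType::get with both fixed and scalable element counts; the wrapper function and its name are illustrative only:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

// Builds <4 x i32> and <vscale x 4 x i32> in the caller's context.
static void makeVectorTypes(llvm::LLVMContext &Ctx) {
  llvm::Type *Int32Ty = llvm::Type::getInt32Ty(Ctx);
  if (!llvm::VectorType::isValidElementType(Int32Ty))
    return;
  auto *FixedTy =
      llvm::VectorType::get(Int32Ty, llvm::ElementCount::getFixed(4));
  auto *ScalableTy =
      llvm::VectorType::get(Int32Ty, llvm::ElementCount::getScalable(4));
  (void)FixedTy;
  (void)ScalableTy;
}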
int getNumOccurrences() const
std::pair< iterator, bool > insert(const ValueT &V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
constexpr ScalarTy getFixedValue() const
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isNonZero() const
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr bool isZero() const
static constexpr bool isKnownGT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
static constexpr bool isKnownGE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
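The ElementCount queries above let the vectorizer reason about vectorization factors without assuming a fixed lane count; a minimal sketch (function name is illustrative):

#include "llvm/Support/TypeSize.h"

static void elementCountDemo() {
  llvm::ElementCount VF = llvm::ElementCount::getScalable(4); // <vscale x 4> lanes
  bool Scalable = VF.isScalable();            // true: scaled by runtime vscale
  unsigned MinLanes = VF.getKnownMinValue();  // 4, the guaranteed minimum
  llvm::ElementCount Half = VF.divideCoefficientBy(2);   // vscale x 2
  bool LE = llvm::ElementCount::isKnownLE(Half, VF);     // provably true for any vscale
  (void)Scalable;
  (void)MinLanes;
  (void)LE;
}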
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
A raw_ostream that writes to an std::string.
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
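llvm_unreachable conventionally terminates fully covered switches; a hedged sketch with a made-up enum:

#include "llvm/Support/ErrorHandling.h"

enum class ExampleWidening { Scalarize, Widen };    // hypothetical, for illustration
static const char *wideningName(ExampleWidening W) {
  switch (W) {
  case ExampleWidening::Scalarize:
    return "scalarize";
  case ExampleWidening::Widen:
    return "widen";
  }
  llvm_unreachable("unknown ExampleWidening kind"); // every case handled above
}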
@ PredicateElseScalarEpilogue
@ PredicateOrDontVectorize
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always be performed.
@ C
The default llvm calling convention, compatible with C.
ID ArrayRef< Type * > Tys
std::variant< std::monostate, Loc::Single, Loc::Multi, Loc::MMI, Loc::EntryValue > Variant
Alias for the std::variant specialization base class of DbgVariable.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
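The PatternMatch helpers above compose into declarative IR matchers; a minimal sketch that recognizes a single-use widening multiply, mul(zext|sext A, zext|sext B), with an illustrative function name:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool isWideningMul(llvm::Value *V, llvm::Value *&A, llvm::Value *&B) {
  using namespace llvm::PatternMatch;
  // On success, A and B capture the narrow operands feeding the extended multiply.
  return match(V, m_OneUse(m_Mul(m_ZExtOrSExt(m_Value(A)),
                                 m_ZExtOrSExt(m_Value(B)))));
}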
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
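cl::init and cl::values seed command-line options in the same style as the loop-vectorize flags declared near the top of this file; the option names and the enum below are made up for illustration:

#include "llvm/Support/CommandLine.h"

static llvm::cl::opt<unsigned> ExampleSmallLoopCost(
    "example-small-loop-cost", llvm::cl::init(20), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical threshold option."));

enum class ExampleMode { Off, On };
static llvm::cl::opt<ExampleMode> ExampleModeOpt(
    "example-mode", llvm::cl::init(ExampleMode::Off), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical enumerated option."),
    llvm::cl::values(clEnumValN(ExampleMode::Off, "off", "Disable the feature"),
                     clEnumValN(ExampleMode::On, "on", "Enable the feature")));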
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< InstrNode * > Instr
NodeAddr< PhiNode * > Phi
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path LLVM_LIFETIME_BOUND)
Get end iterator over path.
bool isUniformAfterVectorization(const VPValue *VPV)
Returns true if VPV is uniform after vectorization.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr, ScalarEvolution &SE)
Get or create a VPValue that corresponds to the expansion of Expr.
const SCEV * getSCEVExprForVPValue(VPValue *V, ScalarEvolution &SE)
Return the SCEV expression for V.
bool simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, ScalarEvolution *SE, AssumptionCache *AC, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
Simplify each loop in a loop nest recursively.
void ReplaceInstWithInst(BasicBlock *BB, BasicBlock::iterator &BI, Instruction *I)
Replace the instruction specified by BI with the instruction specified by I.
Value * addRuntimeChecks(Instruction *Loc, Loop *TheLoop, const SmallVectorImpl< RuntimePointerCheck > &PointerChecks, SCEVExpander &Expander, bool HoistRuntimeChecks=false)
Add code that checks at runtime if the accessed arrays in PointerChecks overlap.
bool RemoveRedundantDbgInstrs(BasicBlock *BB)
Try to remove redundant dbg.value instructions from given basic block.
std::optional< unsigned > getLoopEstimatedTripCount(Loop *L, unsigned *EstimatedLoopInvocationWeight=nullptr)
Returns a loop's estimated trip count based on branch weight metadata.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop, VectorizationFactor VF, unsigned IC)
Report successful vectorization of the loop.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
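A minimal all_of sketch over a basic block (the sibling any_of, none_of, and count_if entries below follow the same range-plus-predicate shape); the function name is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// True when no instruction in BB may have side effects.
static bool blockIsSideEffectFree(const llvm::BasicBlock &BB) {
  return llvm::all_of(BB, [](const llvm::Instruction &I) {
    return !I.mayHaveSideEffects();
  });
}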
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of a load or store instruction.
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
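A minimal enumerate sketch pairing call operands with their indices; the function name is illustrative:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/raw_ostream.h"

static void dumpConstantArgs(const llvm::CallInst *CI) {
  for (const auto &En : llvm::enumerate(CI->args()))
    if (llvm::isa<llvm::Constant>(En.value()))
      llvm::errs() << "operand " << En.index() << " is a constant\n";
}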
auto pred_end(const MachineBasicBlock *BB)
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
auto successors(const MachineBasicBlock *BB)
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
std::pair< Instruction *, ElementCount > InstructionVFPair
Value * getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF)
Return the runtime value for VF.
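getRuntimeVF materializes the VF as an IR value, which matters for scalable VFs where the lane count is only a runtime multiple of vscale. A hedged usage sketch, assuming an IRBuilder positioned where the value is needed; the include for the helper itself is omitted (it is the declaration listed above):

#include "llvm/IR/IRBuilder.h"

static llvm::Value *emitRuntimeVF(llvm::IRBuilderBase &Builder) {
  // A fixed VF folds to a constant; a scalable VF expands to the
  // equivalent of vscale times the known minimum lane count.
  return llvm::getRuntimeVF(Builder, Builder.getInt64Ty(),
                            llvm::ElementCount::getScalable(4));
}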
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
std::optional< MDNode * > makeFollowupLoopID(MDNode *OrigLoopID, ArrayRef< StringRef > FollowupAttrs, const char *InheritOptionsAttrsPrefix="", bool AlwaysNew=false)
Create a new loop identifier for a loop created from a loop transformation.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
bool shouldOptimizeForSize(const MachineFunction *MF, ProfileSummaryInfo *PSI, const MachineBlockFrequencyInfo *BFI, PGSOQueryType QueryType=PGSOQueryType::Other)
Returns true if machine function MF is suggested to be size-optimized based on the profile.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
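make_early_inc_range is the usual way to erase while iterating, since the iterator is advanced before the current element is touched; a minimal sketch (function name illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"   // wouldInstructionBeTriviallyDead

static void dropDeadInstructions(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::wouldInstructionBeTriviallyDead(&I))
      I.eraseFromParent();                 // safe: iteration already moved on
}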
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of a load or store instruction.
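The getLoadStore* helpers listed in this section (pointer operand, type, alignment, address space) give uniform access to memory instructions; a minimal sketch, assuming I is known to be a load or store:

#include "llvm/IR/Instructions.h"
#include <cassert>

static void describeMemOp(llvm::Instruction *I) {
  assert((llvm::isa<llvm::LoadInst>(I) || llvm::isa<llvm::StoreInst>(I)) &&
         "expected a load or store");
  llvm::Value *Ptr = llvm::getLoadStorePointerOperand(I);   // accessed address
  llvm::Type *AccessTy = llvm::getLoadStoreType(I);         // loaded/stored type
  llvm::Align Alignment = llvm::getLoadStoreAlignment(I);   // access alignment
  unsigned AS = llvm::getLoadStoreAddressSpace(I);          // pointer address space
  (void)Ptr;
  (void)AccessTy;
  (void)Alignment;
  (void)AS;
}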
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
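The vp_depth_first_* adaptors walk the VPlan block graph; a hedged sketch (VPlan headers assumed to be available) counting the VPBasicBlocks reachable from the entry without descending into nested regions:

static unsigned countShallowVPBBs(llvm::VPlan &Plan) {
  unsigned NumVPBBs = 0;
  for (llvm::VPBlockBase *VPB :
       llvm::vp_depth_first_shallow(Plan.getEntry()))
    if (llvm::isa<llvm::VPBasicBlock>(VPB))
      ++NumVPBBs;
  return NumVPBBs;
}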
auto map_range(ContainerTy &&C, FuncTy F)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
void collectEphemeralRecipesForVPlan(VPlan &Plan, DenseSet< VPRecipeBase * > &EphRecipes)
auto reverse(ContainerTy &&C)
void setBranchWeights(Instruction &I, ArrayRef< uint32_t > Weights, bool IsExpected)
Create a new branch_weights metadata node and add or overwrite a prof metadata reference to instructi...
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
cl::opt< bool > EnableVPlanNativePath("enable-vplan-native-path", cl::Hidden, cl::desc("Enable VPlan-native vectorization path with " "support for outer loop vectorization."))
void sort(IteratorTy Start, IteratorTy End)
std::unique_ptr< VPlan > VPlanPtr
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
cl::opt< bool > EnableLoopVectorization
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
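make_filter_range lazily skips elements that fail a predicate; a minimal sketch that visits only the stores of a block (function name illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

static unsigned countStores(llvm::BasicBlock &BB) {
  auto Stores = llvm::make_filter_range(BB, [](const llvm::Instruction &I) {
    return llvm::isa<llvm::StoreInst>(I);
  });
  unsigned NumStores = 0;
  for (const llvm::Instruction &S : Stores) {
    (void)S;
    ++NumStores;
  }
  return NumStores;
}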
void llvm_unreachable_internal(const char *msg=nullptr, const char *file=nullptr, unsigned line=0)
This function calls abort(), and prints the optional message to stderr.
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
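divideCeil is how a trip count is rounded up to whole vector iterations; a tiny worked sketch with made-up numbers:

#include "llvm/Support/MathExtras.h"
#include <cstdint>

static uint64_t vectorIterations(uint64_t TripCount, uint64_t VF) {
  // e.g. divideCeil(1003, 8) == 126, since 125 * 8 == 1000 leaves a remainder.
  return llvm::divideCeil(TripCount, VF);
}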
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr, DebugLoc DL={})
Reports an informative message: print Msg for debugging purposes as well as an optimization remark.
bool isAssignmentTrackingEnabled(const Module &M)
Return true if assignment tracking is enabled for module M.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop, Loop *RemainderLoop, uint64_t UF)
Set weights for UnrolledLoop and RemainderLoop based on weights for OrigLoop and the following distri...
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
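alignTo rounds a byte size up to the next multiple of an Align; a minimal sketch:

#include "llvm/Support/Alignment.h"
#include <cstdint>

static uint64_t paddedSize(uint64_t Size) {
  // e.g. Size == 13 yields 16, Size == 40 yields 48.
  return llvm::alignTo(Size, llvm::Align(16));
}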
void reportVectorizationFailure(const StringRef DebugMsg, const StringRef OREMsg, const StringRef ORETag, OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I=nullptr)
Reports a vectorization failure: print DebugMsg for debugging purposes along with the corresponding o...
DWARFExpression::Operation Op
@ CM_ScalarEpilogueNotAllowedLowTripLoop
@ CM_ScalarEpilogueNotNeededUsePredicate
@ CM_ScalarEpilogueNotAllowedOptSize
@ CM_ScalarEpilogueAllowed
@ CM_ScalarEpilogueNotAllowedUsePredicate
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto pred_begin(const MachineBasicBlock *BB)
Value * createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, int64_t Step)
Return a value for Step multiplied by VF.
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
Value * addDiffRuntimeChecks(Instruction *Loc, ArrayRef< PointerDiffInfo > Checks, SCEVExpander &Expander, function_ref< Value *(IRBuilderBase &, unsigned)> GetVF, unsigned IC)
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer list are equal, or the list is empty.
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
@ None
Don't use tail folding.
@ DataWithEVL
Use predicated EVL instructions for tail-folding.
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
unsigned getReciprocalPredBlockProb()
A helper function that returns the reciprocal of the block probability of predicated blocks.
bool hasBranchWeightMD(const Instruction &I)
Checks if an instructions has Branch Weight Metadata.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
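hash_combine folds several values into one hash_code, for example when building a map key; the key struct below is hypothetical:

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instruction.h"

// Hypothetical cache key pairing an instruction with a vectorization factor.
struct ExampleCostKey {
  const llvm::Instruction *I;
  unsigned VF;
};

static llvm::hash_code hash_value(const ExampleCostKey &K) {
  return llvm::hash_combine(K.I, K.VF);   // fold the pointer and the integer
}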
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
bool verifyVPlanIsValid(const VPlan &Plan)
Verify invariants for general VPlans.
MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
cl::opt< bool > EnableLoopInterleaving
Implement std::hash so that hash_code can be used in STL containers.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
A special type used by analysis passes to provide an address that identifies that particular analysis...
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
An information struct used to provide DenseMap with the various necessary components for a given valu...
Encapsulate information regarding vectorization of a loop and its epilogue.
BasicBlock * SCEVSafetyCheck
EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, ElementCount EVF, unsigned EUF, VPlan &EpiloguePlan)
BasicBlock * MemSafetyCheck
BasicBlock * MainLoopIterationCountCheck
BasicBlock * EpilogueIterationCountCheck
A class that represents two vectorization factors (initialized with 0 by default).
static FixedScalableVFPair getNone()
This holds details about a histogram operation – a load -> update -> store sequence where each lane i...
std::optional< unsigned > MaskPos
A struct that represents some properties of the register usage of a loop.
SmallMapVector< unsigned, unsigned, 4 > MaxLocalUsers
Holds the maximum number of concurrent live intervals in the loop.
SmallMapVector< unsigned, unsigned, 4 > LoopInvariantRegs
Holds the number of loop invariant values that are used in the loop.
LoopVectorizeResult runImpl(Function &F)
bool processLoop(Loop *L)
LoopAccessInfoManager * LAIs
void printPipeline(raw_ostream &OS, function_ref< StringRef(StringRef)> MapClassName2PassName)
LoopVectorizePass(LoopVectorizeOptions Opts={})
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
OptimizationRemarkEmitter * ORE
Storage for information about made changes.
A chain of instructions that form a partial reduction.
Instruction * Reduction
The top-level binary operation that forms the reduction to a scalar after the loop body.
Instruction * ExtendA
The extension of each of the inner binary operation's operands.
A CRTP mix-in to automatically provide informational APIs needed for passes.
A marker analysis to determine if extra passes should be run after loop vectorization.
A MapVector that performs no allocations if smaller than a certain size.
Flags describing the kind of vector reduction.
Parameters that control the generic loop unrolling transformation.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
Holds the VFShape for a specific scalar to vector function mapping.
std::optional< unsigned > getParamIndexForOptionalMask() const
Instruction Set Architecture.
Encapsulates information needed to describe a parameter.
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
LoopVectorizationCostModel & CM
bool skipCostComputation(Instruction *UI, bool IsVector) const
Return true if the cost for UI shouldn't be computed, e.g.
InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const
Return the cost for UI with VF using the legacy cost model as fallback until computing the cost of al...
SmallPtrSet< Instruction *, 8 > SkipCostComputation
A recipe for handling first-order recurrence phis.
BasicBlock * PrevBB
The previous IR BasicBlock created or used.
SmallDenseMap< VPBasicBlock *, BasicBlock * > VPBB2IRBB
A mapping of each VPBasicBlock to the corresponding BasicBlock.
VPTransformState holds information passed down when "executing" a VPlan, needed for generating the ou...
DenseMap< const SCEV *, Value * > ExpandedSCEVs
Map SCEVs to their expanded values.
VPTypeAnalysis TypeAnalysis
VPlan-based type analysis.
void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane)
Construct the vector value of a scalarized value V one lane at a time.
Value * get(VPValue *Def, bool IsScalar=false)
Get the generated vector Value for a given VPValue Def if IsScalar is false, otherwise return the gen...
struct llvm::VPTransformState::CFGState CFG
LoopVersioning * LVer
LoopVersioning.
void addNewMetadata(Instruction *To, const Instruction *Orig)
Add additional metadata to To that was not present on Orig.
std::optional< VPLane > Lane
Hold the index to generate specific scalar instructions.
IRBuilderBase & Builder
Hold a reference to the IRBuilder used to generate output IR code.
VPlan * Plan
Pointer to the VPlan code is generated for.
InnerLoopVectorizer * ILV
Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
ElementCount VF
The chosen Vectorization Factor of the loop being vectorized.
void setDebugLocFrom(DebugLoc DL)
Set the debug location in the builder using the debug location DL.
void set(VPValue *Def, Value *V, bool IsScalar=false)
Set the generated vector Value for a given VPValue, if IsScalar is false.
A recipe for widening load operations, using the address to load from and an optional mask.
A recipe for widening select instructions.
A recipe for widening store operations, using the stored value, the address to store to and an option...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void unrollByUF(VPlan &Plan, unsigned UF, LLVMContext &Ctx)
Explicitly unroll Plan by UF.
static void dropPoisonGeneratingRecipes(VPlan &Plan, function_ref< bool(BasicBlock *)> BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed)
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static bool handleUncountableEarlyExit(VPlan &Plan, ScalarEvolution &SE, Loop *OrigLoop, BasicBlock *UncountableExitingBlock, VPRecipeBuilder &RecipeBuilder)
Update Plan to account for the uncountable early exit block in UncountableExitingBlock by.
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static bool tryAddExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void VPInstructionsToVPRecipes(VPlanPtr &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, ScalarEvolution &SE, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
TODO: The following VectorizationFactor was pulled out of LoopVectorizationCostModel class.
InstructionCost Cost
Cost of the loop with that width.
ElementCount MinProfitableTripCount
The minimum trip count required to make vectorization profitable, e.g.
ElementCount Width
Vector width with best cost.
InstructionCost ScalarCost
Cost of the scalar loop.
static VectorizationFactor Disabled()
Width 1 means no vectorization, cost 0 means uncomputed cost.
static bool HoistRuntimeChecks