//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
// stores that can be put together into vector-stores. Next, it attempts to
// construct a vectorizable tree using the use-def chains. If a profitable tree
// was found, the SLP vectorizer performs vectorization on the tree.
//
// The pass is inspired by the work described in the paper:
//  "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
//
//===----------------------------------------------------------------------===//

#ifdef EXPENSIVE_CHECKS

using namespace slpvectorizer;
using namespace std::placeholders;
#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115"Controls which SLP graphs should be vectorized.");
119cl::desc(
"Run the SLP vectorization passes"));
123cl::desc(
"Enable vectorization for wider vector utilization"));
127cl::desc(
"Only vectorize if you gain more than this " 132cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on " 133"heuristics and makes vectorization decision via cost modeling."));
137cl::desc(
"Attempt to vectorize horizontal reductions"));
142"Attempt to vectorize horizontal reductions feeding into a store"));
146cl::desc(
"Attempt to vectorize for this register size in bits"));
150cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
/// Limits the size of scheduling regions in a block.
/// It avoids long compile times for _very_ large blocks where vector
/// instructions are spread over a wide range.
/// This limit is way higher than needed by real-world functions.
    cl::desc("Limit the size of the SLP scheduling region per block"));
    cl::desc("Attempt to vectorize for this register size in bits"));

    cl::desc("Limit the recursion depth when building a vectorizable tree"));

    cl::desc("Only vectorize small trees if they are fully vectorizable"));

// The maximum depth that the look-ahead score heuristic will explore.
// The higher this value, the higher the compilation time overhead.
    cl::desc("The maximum look-ahead depth for operand reordering scores"));

// The maximum depth that the look-ahead score heuristic will explore
// when it is probing among candidates for vectorization tree roots.
// The higher this value, the higher the compilation time overhead, but unlike
// the similar limit for operand ordering this one is less frequently used,
// hence the impact of a higher value is less noticeable.
    cl::desc("The maximum look-ahead depth for searching best rooting option"));

    cl::desc("The minimum number of loads, which should be considered strided, "
             "if the stride is > 1 or is a runtime value"));

    cl::desc("The maximum stride, considered to be profitable."));

    cl::desc("Display the SLP trees with Graphviz"));

    cl::desc("Try to vectorize with non-power-of-2 number of elements."));
// Limit the number of alias checks. The limit is chosen so that
// it has no negative effect on the llvm benchmarks.

// Limit of the number of uses for potentially transformed instructions/values,
// used in checks to avoid compile-time explosion.

// Another limit for the alias checks: The maximum distance between load/store
// instructions where alias checks are done.
// This limit is useful for very large basic blocks.

/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
/// regions to be handled.

/// Maximum allowed number of operands in the PHI nodes.

/// Predicate for the element types that the SLP vectorizer supports.
///
/// The most important thing to filter here are types which are invalid in LLVM
/// vectors. We also filter target specific types which have absolutely no
/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
/// avoids spending time checking the cost model and realizing that they will
/// be inevitably scalarized.
static bool isValidElementType(Type *Ty) {
  // TODO: Support ScalableVectorType.
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
}
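// For example, i32 and float are accepted, while x86_fp80 and ppc_fp128 are
// rejected up front; with SLPReVec enabled, a fixed vector such as <4 x i8>
// is checked via its scalar element type.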
/// Returns the type of the given value/instruction \p V. If it is store,
/// returns the type of its value operand, for Cmp - the types of the compare
/// operands and for insertelement - the type of the inserted operand.
/// Otherwise, just the type of the value is returned.
static Type *getValueType(Value *V) {
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  return V->getType();
}
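// E.g., for `store i32 %x, ptr %p` this yields i32, for `icmp ult i64 %a, %b`
// it yields i64 (the compared type, not i1), and for
// `insertelement <4 x float> %v, float %s, i32 1` it yields float.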
/// \returns the number of elements for Ty.
static unsigned getNumElements(Type *Ty) {
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
  return 1;
}
/// \returns the vector type of ScalarTy based on vectorization factor.
static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
  return FixedVectorType::get(ScalarTy->getScalarType(),
                              VF * getNumElements(ScalarTy));
}

/// Returns the number of elements of the given type \p Ty, not less than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// instruction selection.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI,
                                              Type *Ty, unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_ceil(Sz);
  // Find the number of elements, which forms full vectors.
  const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);
  return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
}

/// Returns the number of elements of the given type \p Ty, not greater than \p
/// Sz, which forms a type that \p TTI splits into whole vector types during
/// instruction selection.
static unsigned
getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
                                   unsigned Sz) {
  if (!isValidElementType(Ty))
    return bit_floor(Sz);
  // Find the number of elements, which forms full vectors.
  unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);
  unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
  if (RegVF > Sz)
    return bit_floor(Sz);
  return (Sz / RegVF) * RegVF;
}
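// Worked example (assuming a target where <6 x i32> legalizes into two
// 4-element registers): for Ty = i32 and Sz = 6, the "full" variant rounds up
// to 8 elements (two whole registers), while the "floor" variant rounds down
// to 4 elements (one whole register).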
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
                                                   SmallVectorImpl<int> &Mask) {
  // The ShuffleBuilder implementation uses shufflevector to splat an
  // "element". But the element has a different meaning for SLP (scalar) and
  // REVEC (vector). We need to expand Mask into masks which shufflevector can
  // use directly.
  SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
  Mask.swap(NewMask);
}
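// E.g., with VecTyNumElements == 2, the SLP-level mask {1, 0} expands to the
// shufflevector mask {2, 3, 0, 1}; a PoisonMaskElem entry stays poison for the
// whole expanded group.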
/// \returns the number of groups of shufflevector
/// A group has the following features:
/// 1. All of the values in a group are shufflevector instructions.
/// 2. The mask of every shufflevector is an extract-subvector mask
///    (isExtractSubvectorMask).
/// 3. Together, the masks of the shufflevectors use all of the elements of the
///    source.
/// e.g., it is 1 group (%0)
/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
///     <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
///     <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
/// it is 2 groups (%3 and %4)
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// it is 0 groups
/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
static unsigned getShufflevectorNumGroups(ArrayRef<Value *> VL) {
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);
    SmallBitVector ExpectedIndex(GroupSize);
    if (!all_of(VL.slice(I, GroupSize), [&](Value *V) {
          auto *SV = cast<ShuffleVectorInst>(V);
          // From the same source.
          if (SV->getOperand(0) != Src)
            return false;
          int Index;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);
          return true;
        }))
      return 0;
    if (!ExpectedIndex.all())
      return 0;
    ++NumGroup;
  }
  assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
  return NumGroup;
}
/// \returns a shufflevector mask which is used to vectorize shufflevectors
/// e.g.,
/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
///     <4 x i32> <i32 4, i32 5, i32 6, i32 7>
/// the result is
/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
static SmallVector<int> calculateShufflevectorMask(ArrayRef<Value *> VL) {
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  SmallVector<int> Mask;
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
  return Mask;
}
/// \returns True if the value is a constant (but not globals/constant
/// expressions).
static bool isConstant(Value *V) {
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
}
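// E.g., `i32 7` and `zeroinitializer` count as constants here, while a global
// `@g` or a constant expression such as `ptrtoint (ptr @g to i64)` does not.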
/// Checks if \p V is one of vector-like instructions, i.e. undef,
/// insertelement/extractelement with constant indices for fixed vector type or
/// extractvalue instruction.
static bool isVectorLikeInstWithConstOps(Value *V) {
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return isConstant(I->getOperand(2));
}
432/// Returns power-of-2 number of elements in a single register (part), given the 433/// total number of elements \p Size and number of registers (parts) \p 439/// Returns correct remaining number of elements, considering total amount \p 440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems 441/// and current register (part) \p Part. 444return std::min<unsigned>(PartNumElems,
Size - Part * PartNumElems);
/// Print a short descriptor of the instruction bundle suitable for debug
/// output.
static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
  std::string Result;
  raw_string_ostream OS(Result);
  if (Idx >= 0)
    OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  return Result;
}
/// \returns true if all of the instructions in \p VL are in the same block or
/// false otherwise.
static bool allSameBlock(ArrayRef<Value *> VL) {
  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return false;
  BasicBlock *BB = cast<Instruction>(*It)->getParent();
  for (Value *V : VL) {
    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);
    if (!II)
      return false;
    if (BB != II->getParent())
      return false;
  }
  return true;
}
/// \returns True if all of the values in \p VL are constants (but not
/// globals/constant expressions).
static bool allConstant(ArrayRef<Value *> VL) {
  // Constant expressions and globals can't be vectorized like normal
  // integer/FP constants.
  return all_of(VL, isConstant);
}

/// \returns True if all of the values in \p VL are identical or some of them
/// are UndefValue.
static bool isSplat(ArrayRef<Value *> VL) {
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
}
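// E.g., {%x, undef, %x} is treated as a splat of %x, while {undef, undef} is
// not, since there is no defined value to splat.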
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
static bool isCommutative(Instruction *I) {
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(
                BO->uses(),
                [](const Use &U) {
                  // Commutative, if icmp eq/ne sub, 0
                  CmpPredicate Pred;
                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;
                  // Commutative, if abs(sub nsw, true) or abs(sub, false).
                  ConstantInt *Flag;
                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
                          Flag->isOne());
                })) ||
           (BO->getOpcode() == Instruction::FSub &&
            !BO->hasNUsesOrMore(UsesLimit) &&
            all_of(BO->uses(), [](const Use &U) {
              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
            }));
  return I->isCommutative();
}
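// Rationale: a 'sub' whose only users are 'icmp eq/ne ..., 0' behaves
// commutatively, because swapping the operands only negates the difference,
// which does not change an equality comparison with zero. Similarly, 'sub'
// feeding only 'abs' (and 'fsub' feeding only 'fabs') is insensitive to the
// operand order up to the sign of the result, subject to the nsw/poison flag
// checks above.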
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  }
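  // E.g., with a base Offset of 1 and an insertelement into <4 x float> at
  // constant index 2, the combined index becomes 1 * 4 + 2 == 6 (Index starts
  // from Offset, as described in the doc comment below).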
/// \returns inserting or extracting index of InsertElement, ExtractElement or
/// InsertValue instruction, using Offset as base offset for index.
/// \returns std::nullopt if the index is not an immediate.
static std::optional<unsigned> getElementIndex(const Value *Inst,
                                               unsigned Offset = 0) {
  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;
  const auto *IV = dyn_cast<InsertValueInst>(Inst);
  if (!IV)
    return std::nullopt;
  unsigned Index = Offset;
  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
    } else {
      return std::nullopt;
    }
    Index += I;
  }
  return Index;
}
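// E.g., for `insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0` the
// aggregate indices {1, 0} are linearized row-major into the flat index
// 1 * 2 + 0 == 2.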
/// Specifies the way the mask should be analyzed for undefs/poisonous elements
/// in the shuffle mask.
enum class UseMask {
  FirstArg,    ///< The mask is expected to be for permutation of 1-2 vectors,
               ///< check for the mask elements for the first argument (mask
               ///< indices are in range [0:VF)).
  SecondArg,   ///< The mask is expected to be for permutation of 2 vectors,
               ///< check for the mask elements for the second argument (mask
               ///< indices are in range [VF:2*VF)).
  UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
               ///< future shuffle elements and mark them as ones as being used
               ///< in future. Non-undef elements are considered as unused since
               ///< they're already marked as used in the mask.
};

/// Prepares a use bitset for the given mask either for the first argument or
/// for the second.
      if (MaskArg == UseMask::UndefsAsMask)
625if (MaskArg == UseMask::FirstArg &&
Value < VF)
626 UseMask.reset(
Value);
627elseif (MaskArg == UseMask::SecondArg &&
Value >= VF)
628 UseMask.reset(
Value - VF);
633/// Checks if the given value is actually an undefined constant vector. 634/// Also, if the \p UseMask is not empty, tries to check if the non-masked 635/// elements actually mask the insertelement buildvector, if any. 636template <
bool IsPoisonOnly = false>
  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
643auto *VecTy = dyn_cast<FixedVectorType>(
V->getType());
646auto *
C = dyn_cast<Constant>(V);
648if (!UseMask.empty()) {
650while (
auto *
II = dyn_cast<InsertElementInst>(
Base)) {
652if (isa<T>(
II->getOperand(1)))
659if (*
Idx < UseMask.size() && !UseMask.test(*
Idx))
662// TODO: Add analysis for shuffles here too. 667 Res &= isUndefVector<IsPoisonOnly>(
Base, SubMask);
674for (
unsignedI = 0, E = VecTy->getNumElements();
I != E; ++
I) {
677 (UseMask.empty() || (
I < UseMask.size() && !UseMask.test(
I))))
683/// Checks if the vector of instructions can be represented as a shuffle, like: 684/// %x0 = extractelement <4 x i8> %x, i32 0 685/// %x3 = extractelement <4 x i8> %x, i32 3 686/// %y1 = extractelement <4 x i8> %y, i32 1 687/// %y2 = extractelement <4 x i8> %y, i32 2 688/// %x0x0 = mul i8 %x0, %x0 689/// %x3x3 = mul i8 %x3, %x3 690/// %y1y1 = mul i8 %y1, %y1 691/// %y2y2 = mul i8 %y2, %y2 692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0 693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1 694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2 695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3 696/// ret <4 x i8> %ins4 697/// can be transformed into: 698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5, 700/// %2 = mul <4 x i8> %1, %1 702/// Mask will return the Shuffle Mask equivalent to the extracted elements. 703/// TODO: Can we split off and reuse the shuffle mask detection from 704/// ShuffleVectorInst/getShuffleCost? 705static std::optional<TargetTransformInfo::ShuffleKind>
708constauto *It =
find_if(VL, IsaPred<ExtractElementInst>);
712 std::accumulate(VL.
begin(), VL.
end(), 0u, [](
unsigned S,
Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
719 return std::max(S, VTy->getNumElements());
725auto *EE = dyn_cast<ExtractElementInst>(V);
728Value *Vec = EE->getVectorOperand();
729if (isa<UndefValue>(Vec))
734 ShuffleMode CommonShuffleMode =
Unknown;
736for (
unsignedI = 0, E = VL.
size();
I < E; ++
I) {
737// Undef can be represented as an undef element in a vector. 738if (isa<UndefValue>(VL[
I]))
740auto *EI = cast<ExtractElementInst>(VL[
I]);
741if (isa<ScalableVectorType>(EI->getVectorOperandType()))
743auto *Vec = EI->getVectorOperand();
744// We can extractelement from undef or poison vector. 745if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
747// All vector operands must have the same number of vector elements. 748if (isa<UndefValue>(Vec)) {
751if (isa<UndefValue>(EI->getIndexOperand()))
753auto *
Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
756// Undefined behavior if Idx is negative or >= Size. 759unsigned IntIdx =
Idx->getValue().getZExtValue();
764// For correct shuffling we have to have at most 2 different vector operands 765// in all extractelement instructions. 766if (!Vec1 || Vec1 == Vec) {
768 }
elseif (!Vec2 || Vec2 == Vec) {
774if (CommonShuffleMode == Permute)
776// If the extract index is not the same as the operation number, it is a 779 CommonShuffleMode = Permute;
782 CommonShuffleMode =
Select;
784// If we're not crossing lanes in different vectors, consider it as blending. 785if (CommonShuffleMode ==
Select && Vec2)
787// If Vec2 was never used, we have a permutation of a single vector, otherwise 788// we have permutation of 2 vectors. 793/// \returns True if Extract{Value,Element} instruction extracts element Idx. 796assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798"Expected extractelement or extractvalue instruction.");
799if (Opcode == Instruction::ExtractElement) {
800auto *CI = dyn_cast<ConstantInt>(E->
getOperand(1));
803return CI->getZExtValue();
805auto *EI = cast<ExtractValueInst>(E);
806if (EI->getNumIndices() != 1)
808return *EI->idx_begin();
813/// Main data required for vectorization of instructions. 814classInstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0. 821assert(valid() &&
"InstructionsState is invalid.");
826assert(valid() &&
"InstructionsState is invalid.");
830 /// The main/alternate opcodes for the list of instructions. 831unsignedgetOpcode()
const{
return getMainOp()->getOpcode(); }
833unsigned getAltOpcode()
const{
return getAltOp()->getOpcode(); }
835 /// Some of the instructions in the list have alternate opcodes. 836bool isAltShuffle()
const{
return getMainOp() != getAltOp(); }
839unsigned CheckedOpcode =
I->getOpcode();
840returngetOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
843 /// Checks if the current state is valid, i.e. has non-null MainOp 844bool valid()
const{
return MainOp && AltOp; }
846explicitoperatorbool()
const{
return valid(); }
848 InstructionsState() =
delete;
850 : MainOp(MainOp), AltOp(AltOp) {}
851static InstructionsState invalid() {
return {
nullptr,
nullptr}; }
854}
// end anonymous namespace 856/// \returns true if \p Opcode is allowed as part of the main/alternate 857/// instruction for SLP vectorization. 859/// Example of unsupported opcode is SDIV that can potentially cause UB if the 860/// "shuffled out" lane would result in division by zero. 871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e. 872/// compatible instructions or constants, or just some other regular values. 877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
884/// \returns true if a compare instruction \p CI has similar "look" and 885/// same predicate as \p BaseCI, "as is" or with its operands and predicate 886/// swapped, false otherwise. 890"Assessing comparisons of different types?");
900return (BasePred == Pred &&
902 (BasePred == SwappedPred &&
906/// \returns analysis of the Instructions in \p VL described in 907/// InstructionsState, the Opcode that we suppose the whole list 908/// could be vectorized even if its structure is diverse. 911// Make sure these are all Instructions. 912if (!
all_of(VL, IsaPred<Instruction, PoisonValue>))
913return InstructionsState::invalid();
915auto *It =
find_if(VL, IsaPred<Instruction>);
917return InstructionsState::invalid();
920unsigned InstCnt = std::count_if(It, VL.
end(), IsaPred<Instruction>);
921if ((VL.
size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.
size() / 2) ||
922 (VL.
size() == 2 && InstCnt < 2))
923return InstructionsState::invalid();
925bool IsCastOp = isa<CastInst>(MainOp);
926bool IsBinOp = isa<BinaryOperator>(MainOp);
927bool IsCmpOp = isa<CmpInst>(MainOp);
932unsigned AltOpcode = Opcode;
934bool SwappedPredsCompatible = IsCmpOp && [&]() {
936 UniquePreds.
insert(BasePred);
937 UniqueNonSwappedPreds.
insert(BasePred);
939auto *
I = dyn_cast<CmpInst>(V);
945 UniqueNonSwappedPreds.
insert(CurrentPred);
946if (!UniquePreds.
contains(CurrentPred) &&
947 !UniquePreds.
contains(SwappedCurrentPred))
948 UniquePreds.
insert(CurrentPred);
950// Total number of predicates > 2, but if consider swapped predicates 951// compatible only 2, consider swappable predicates as compatible opcodes, 953return UniqueNonSwappedPreds.
size() > 2 && UniquePreds.
size() == 2;
955// Check for one alternate opcode from another BinaryOperator. 956// TODO - generalize to support all operators (types, calls etc.). 959if (
auto *
CallBase = dyn_cast<CallInst>(MainOp)) {
963return InstructionsState::invalid();
965bool AnyPoison = InstCnt != VL.
size();
966// Check MainOp too to be sure that it matches the requirements for the 969auto *
I = dyn_cast<Instruction>(V);
973// Cannot combine poison and divisions. 974// TODO: do some smart analysis of the CallInsts to exclude divide-like 975// intrinsics/functions only. 976if (AnyPoison && (
I->isIntDivRem() ||
I->isFPDivRem() || isa<CallInst>(
I)))
977return InstructionsState::invalid();
978unsigned InstOpcode =
I->getOpcode();
979if (IsBinOp && isa<BinaryOperator>(
I)) {
980if (InstOpcode == Opcode || InstOpcode == AltOpcode)
984 AltOpcode = InstOpcode;
988 }
elseif (IsCastOp && isa<CastInst>(
I)) {
991Value *Op1 =
I->getOperand(0);
994if (InstOpcode == Opcode || InstOpcode == AltOpcode)
996if (Opcode == AltOpcode) {
999"Cast isn't safe for alternation, logic needs to be updated!");
1000 AltOpcode = InstOpcode;
1005 }
elseif (
auto *Inst = dyn_cast<CmpInst>(
I); Inst && IsCmpOp) {
1006auto *BaseInst = cast<CmpInst>(MainOp);
1007Type *Ty0 = BaseInst->getOperand(0)->getType();
1008Type *Ty1 = Inst->getOperand(0)->getType();
1010assert(InstOpcode == Opcode &&
"Expected same CmpInst opcode.");
1011assert(InstOpcode == AltOpcode &&
1012"Alternate instructions are only supported by BinaryOperator " 1014// Check for compatible operands. If the corresponding operands are not 1015// compatible - need to perform alternate vectorization. 1020if ((VL.
size() == 2 || SwappedPredsCompatible) &&
1021 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1026auto *AltInst = cast<CmpInst>(AltOp);
1027if (MainOp != AltOp) {
1030 }
elseif (BasePred != CurrentPred) {
1033"CmpInst isn't safe for alternation, logic needs to be updated!");
1038if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1039 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1042 }
elseif (InstOpcode == Opcode) {
1043assert(InstOpcode == AltOpcode &&
1044"Alternate instructions are only supported by BinaryOperator and " 1046if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
1047if (Gep->getNumOperands() != 2 ||
1049return InstructionsState::invalid();
1050 }
elseif (
auto *EI = dyn_cast<ExtractElementInst>(
I)) {
1052return InstructionsState::invalid();
1053 }
elseif (
auto *LI = dyn_cast<LoadInst>(
I)) {
1054auto *BaseLI = cast<LoadInst>(MainOp);
1055if (!LI->isSimple() || !BaseLI->isSimple())
1056return InstructionsState::invalid();
1057 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
1058auto *
CallBase = cast<CallInst>(MainOp);
1060return InstructionsState::invalid();
1061if (Call->hasOperandBundles() &&
1063 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1064 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1067return InstructionsState::invalid();
1070return InstructionsState::invalid();
1073if (Mappings.
size() != BaseMappings.
size() ||
1074 Mappings.
front().ISA != BaseMappings.
front().ISA ||
1075 Mappings.
front().ScalarName != BaseMappings.
front().ScalarName ||
1076 Mappings.
front().VectorName != BaseMappings.
front().VectorName ||
1077 Mappings.
front().Shape.VF != BaseMappings.
front().Shape.VF ||
1078 Mappings.
front().Shape.Parameters !=
1079 BaseMappings.
front().Shape.Parameters)
1080return InstructionsState::invalid();
1085return InstructionsState::invalid();
1088return InstructionsState(MainOp, AltOp);
1091/// \returns true if all of the values in \p VL have the same type or false 1098/// \returns True if in-tree use also needs extract. This refers to 1099/// possible scalar operand in vectorized instruction. 1107case Instruction::Load: {
1108LoadInst *LI = cast<LoadInst>(UserInst);
1111case Instruction::Store: {
1112StoreInst *SI = cast<StoreInst>(UserInst);
1113return (SI->getPointerOperand() == Scalar);
1115case Instruction::Call: {
1116CallInst *CI = cast<CallInst>(UserInst);
1119 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1120 Arg.value().get() == Scalar;
1128/// \returns the AA location that is being access by the instruction. 1132if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1137/// \returns True if the instruction is not a volatile or atomic load/store. 1139if (
LoadInst *LI = dyn_cast<LoadInst>(
I))
1140return LI->isSimple();
1142return SI->isSimple();
1144return !
MI->isVolatile();
1148/// Shuffles \p Mask in accordance with the given \p SubMask. 1149/// \param ExtendingManyInputs Supports reshuffling of the mask with not only 1150/// one but two input vectors. 1152bool ExtendingManyInputs =
false) {
1156 (!ExtendingManyInputs || SubMask.
size() > Mask.size() ||
1157// Check if input scalars were extended to match the size of other node. 1159"SubMask with many inputs support must be larger than the mask.");
1161 Mask.append(SubMask.
begin(), SubMask.
end());
1165int TermValue = std::min(Mask.size(), SubMask.
size());
1166for (
intI = 0, E = SubMask.
size();
I < E; ++
I) {
1168 (!ExtendingManyInputs &&
1169 (SubMask[
I] >= TermValue || Mask[SubMask[
I]] >= TermValue)))
1171 NewMask[
I] = Mask[SubMask[
I]];
1176/// Order may have elements assigned special value (size) which is out of 1177/// bounds. Such indices only appear on places which correspond to undef values 1178/// (see canReuseExtract for details) and used in order to avoid undef values 1179/// have effect on operands ordering. 1180/// The first loop below simply finds all unused indices and then the next loop 1181/// nest assigns these indices for undef values positions. 1182/// As an example below Order has two undef positions and they have assigned 1183/// values 3 and 7 respectively: 1184/// before: 6 9 5 4 9 2 1 0 1185/// after: 6 3 5 4 7 2 1 0 1187constunsigned Sz = Order.
size();
1190for (
unsignedI = 0;
I < Sz; ++
I) {
1192 UnusedIndices.
reset(Order[
I]);
1194 MaskedIndices.
set(
I);
1196if (MaskedIndices.
none())
1199"Non-synced masked/available indices.");
1203assert(
Idx >= 0 &&
"Indices must be synced.");
1210/// \returns a bitset for selecting opcodes. false for Opcode0 and true for 1214Type *ScalarTy = VL[0]->getType();
1217for (
unsigned Lane : seq<unsigned>(VL.
size())) {
1218if (isa<PoisonValue>(VL[Lane]))
1220if (cast<Instruction>(VL[Lane])->
getOpcode() == Opcode1)
1221 OpcodeMask.
set(Lane * ScalarTyNumElements,
1222 Lane * ScalarTyNumElements + ScalarTyNumElements);
1232constunsigned E = Indices.
size();
1234for (
unsignedI = 0;
I < E; ++
I)
1235 Mask[Indices[
I]] =
I;
1238/// Reorders the list of scalars in accordance with the given \p Mask. 1241assert(!Mask.empty() &&
"Expected non-empty mask.");
1245for (
unsignedI = 0, E = Prev.
size();
I < E; ++
I)
1247 Scalars[Mask[
I]] = Prev[
I];
1250/// Checks if the provided value does not require scheduling. It does not 1251/// require scheduling if this is not an instruction or it is an instruction 1252/// that does not read/write memory and all operands are either not instructions 1253/// or phi nodes or instructions from different blocks. 1255auto *
I = dyn_cast<Instruction>(V);
1260 auto *IO = dyn_cast<Instruction>(V);
1263 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1267/// Checks if the provided value does not require scheduling. It does not 1268/// require scheduling if this is not an instruction or it is an instruction 1269/// that does not read/write memory and all users are phi nodes or instructions 1270/// from the different blocks. 1272auto *
I = dyn_cast<Instruction>(V);
1275// Limits the number of uses to save compile time. 1276return !
I->mayReadOrWriteMemory() && !
I->hasNUsesOrMore(
UsesLimit) &&
1278 auto *IU = dyn_cast<Instruction>(U);
1281 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1285/// Checks if the specified value does not require scheduling. It does not 1286/// require scheduling if all operands and all users do not need to be scheduled 1287/// in the current basic block. 1292/// Checks if the specified array of instructions does not require scheduling. 1293/// It is so if all either instructions have operands that do not require 1294/// scheduling or their users do not require scheduling since they are phis or 1295/// in other basic blocks. 1297return !VL.
empty() &&
1301/// Returns true if widened type of \p Ty elements with size \p Sz represents 1302/// full vector type, i.e. adding extra element results in extra parts upon type 1313return NumParts > 0 && NumParts < Sz &&
has_single_bit(Sz / NumParts) &&
1317/// Returns number of parts, the type \p VecTy will be split at the codegen 1318/// phase. If the type is going to be scalarized or does not uses whole 1319/// registers, returns 1. 1322constunsigned Limit = std::numeric_limits<unsigned>::max()) {
1324if (NumParts == 0 || NumParts >= Limit)
1327if (NumParts >= Sz || Sz % NumParts != 0 ||
1333namespaceslpvectorizer {
1335/// Bottom Up SLP Vectorizer. 1343 /// Tracks the state we can represent the loads in the given sequence. 1362 : BatchAA(*Aa),
F(Func), SE(Se),
TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1363 AC(AC), DB(DB),
DL(
DL), ORE(ORE),
1366// Use the vector register size specified by the target unless overridden 1367// by a command-line option. 1368// TODO: It would be better to limit the vectorization factor based on 1369// data type rather than just register size. For example, x86 AVX has 1370// 256-bit registers, but it does not support integer operations 1371// at that width (that requires AVX2). 1385 /// Vectorize the tree that starts with the elements in \p VL. 1386 /// Returns the vectorized root. 1389 /// Vectorize the tree but with the list of externally used values \p 1390 /// ExternallyUsedValues. Values in this MapVector can be replaced but the 1391 /// generated extractvalue instructions. 1396 /// \returns the cost incurred by unwanted spills and fills, caused by 1397 /// holding live values over call sites. 1400 /// \returns the vectorization cost of the subtree that starts at \p VL. 1401 /// A negative number means that this is profitable. 1404 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for 1405 /// the purpose of scheduling and extraction in the \p UserIgnoreLst. 1409 /// Construct a vectorizable tree that starts at \p Roots. 1412 /// Returns whether the root node has in-tree uses. 1414return !VectorizableTree.
empty() &&
1415 !VectorizableTree.
front()->UserTreeIndices.empty();
1418 /// Return the scalars of the root node. 1420assert(!VectorizableTree.
empty() &&
"No graph to get the first node from");
1421return VectorizableTree.
front()->Scalars;
1424 /// Returns the type/is-signed info for the root node in the graph without 1427const TreeEntry &Root = *VectorizableTree.
front().get();
1428if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1429 !Root.Scalars.front()->getType()->isIntegerTy())
1431auto It = MinBWs.
find(&Root);
1432if (It != MinBWs.
end())
1436if (Root.getOpcode() == Instruction::ZExt ||
1437 Root.getOpcode() == Instruction::SExt)
1438return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1439 Root.getOpcode() == Instruction::SExt);
1443 /// Checks if the root graph node can be emitted with narrower bitwidth at 1444 /// codegen and returns it signedness, if so. 1446return MinBWs.
at(VectorizableTree.
front().get()).second;
1449 /// Returns reduction type after minbitdth analysis. 1451if (ReductionBitWidth == 0 ||
1452 !VectorizableTree.
front()->Scalars.front()->getType()->isIntegerTy() ||
1453 ReductionBitWidth >=
1454DL->getTypeSizeInBits(
1455 VectorizableTree.
front()->Scalars.front()->getType()))
1457 VectorizableTree.
front()->Scalars.front()->getType(),
1458 VectorizableTree.
front()->getVectorFactor());
1461 VectorizableTree.
front()->Scalars.front()->getContext(),
1463 VectorizableTree.
front()->getVectorFactor());
1466 /// Builds external uses of the vectorized scalars, i.e. the list of 1467 /// vectorized scalars to be extracted, their lanes and their scalar users. \p 1468 /// ExternallyUsedValues contains additional list of external uses to handle 1469 /// vectorization of reductions. 1473 /// Transforms graph nodes to target specific representations, if profitable. 1476 /// Clear the internal data structures that are created by 'buildTree'. 1478 VectorizableTree.
clear();
1479 ScalarToTreeEntry.clear();
1480 MultiNodeScalars.clear();
1482 NonScheduledFirst.
clear();
1483 EntryToLastInstruction.clear();
1484 LoadEntriesToVectorize.
clear();
1485 IsGraphTransformMode =
false;
1486 GatheredLoadsEntriesFirst.reset();
1487 ExternalUses.
clear();
1488 ExternalUsesAsOriginalScalar.clear();
1489for (
auto &Iter : BlocksSchedules) {
1490 BlockScheduling *BS = Iter.second.get();
1494 ReductionBitWidth = 0;
1496 CastMaxMinBWSizes.reset();
1497 ExtraBitWidthNodes.
clear();
1498 InstrElementSize.clear();
1499 UserIgnoreList =
nullptr;
1500 PostponedGathers.
clear();
1501 ValueToGatherNodes.
clear();
1506 /// Returns the base graph size, before any transformations. 1509 /// Perform LICM and CSE on the newly generated gather sequences. 1512 /// Does this non-empty order represent an identity order? Identity 1513 /// should be represented as an empty order, so this is used to 1514 /// decide if we can canonicalize a computed order. Undef elements 1515 /// (represented as size) are ignored. 1517assert(!Order.
empty() &&
"expected non-empty order");
1518constunsigned Sz = Order.
size();
1520returnP.value() ==
P.index() ||
P.value() == Sz;
1524 /// Checks if the specified gather tree entry \p TE can be represented as a 1525 /// shuffled vector entry + (possibly) permutation with other gathers. It 1526 /// implements the checks only for possibly ordered scalars (Loads, 1527 /// ExtractElement, ExtractValue), which can be part of the graph. 1530 /// Sort loads into increasing pointers offsets to allow greater clustering. 1533 /// Gets reordering data for the given tree entry. If the entry is vectorized 1534 /// - just return ReorderIndices, otherwise check if the scalars can be 1535 /// reordered and return the most optimal order. 1536 /// \return std::nullopt if ordering is not important, empty order, if 1537 /// identity order is important, or the actual order. 1538 /// \param TopToBottom If true, include the order of vectorized stores and 1539 /// insertelement nodes, otherwise skip them. 1543 /// Reorders the current graph to the most profitable order starting from the 1544 /// root node to the leaf nodes. The best order is chosen only from the nodes 1545 /// of the same size (vectorization factor). Smaller nodes are considered 1546 /// parts of subgraph with smaller VF and they are reordered independently. We 1547 /// can make it because we still need to extend smaller nodes to the wider VF 1548 /// and we can merge reordering shuffles with the widening shuffles. 1551 /// Reorders the current graph to the most profitable order starting from 1552 /// leaves to the root. It allows to rotate small subgraphs and reduce the 1553 /// number of reshuffles if the leaf nodes use the same order. In this case we 1554 /// can merge the orders and just shuffle user node instead of shuffling its 1555 /// operands. Plus, even the leaf nodes have different orders, it allows to 1556 /// sink reordering in the graph closer to the root node and merge it later 1557 /// during analysis. 1560 /// \return The vector element size in bits to use when vectorizing the 1561 /// expression tree ending at \p V. If V is a store, the size is the width of 1562 /// the stored value. Otherwise, the size is the width of the largest loaded 1563 /// value reaching V. This method is used by the vectorizer to calculate 1564 /// vectorization factors. 1567 /// Compute the minimum type sizes required to represent the entries in a 1568 /// vectorizable tree. 1571// \returns maximum vector register size as set by TTI or overridden by cl::opt. 1573return MaxVecRegSize;
1576// \returns minimum vector register size as set by cl::opt. 1578return MinVecRegSize;
1588return MaxVF ? MaxVF : UINT_MAX;
1591 /// Check if homogeneous aggregate is isomorphic to some VectorType. 1592 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like 1593 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> }, 1594 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on. 1596 /// \returns number of elements in vector if isomorphism exists, 0 otherwise. 1599 /// \returns True if the VectorizableTree is both tiny and not fully 1600 /// vectorizable. We do not vectorize such trees. 1603 /// Checks if the graph and all its subgraphs cannot be better vectorized. 1604 /// It may happen, if all gather nodes are loads and they cannot be 1605 /// "clusterized". In this case even subgraphs cannot be vectorized more 1606 /// effectively than the base graph. 1609 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values 1610 /// can be load combined in the backend. Load combining may not be allowed in 1611 /// the IR optimizer, so we do not want to alter the pattern. For example, 1612 /// partially transforming a scalar bswap() pattern into vector code is 1613 /// effectively impossible for the backend to undo. 1614 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1615 /// may not be necessary. 1618 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values 1619 /// can be load combined in the backend. Load combining may not be allowed in 1620 /// the IR optimizer, so we do not want to alter the pattern. For example, 1621 /// partially transforming a scalar bswap() pattern into vector code is 1622 /// effectively impossible for the backend to undo. 1623 /// TODO: If load combining is allowed in the IR optimizer, this analysis 1624 /// may not be necessary. 1627 /// Checks if the given array of loads can be represented as a vectorized, 1628 /// scatter or just simple gather. 1629 /// \param VL list of loads. 1630 /// \param VL0 main load value. 1631 /// \param Order returned order of load instructions. 1632 /// \param PointerOps returned list of pointer operands. 1633 /// \param BestVF return best vector factor, if recursive check found better 1634 /// vectorization sequences rather than masked gather. 1635 /// \param TryRecursiveCheck used to check if long masked gather can be 1636 /// represented as a serie of loads/insert subvector, if profitable. 1640unsigned *BestVF =
nullptr,
1641bool TryRecursiveCheck =
true)
const;
1643 /// Registers non-vectorizable sequence of loads 1648 /// Checks if the given loads sequence is known as not vectorizable 1649template <
typename T>
1656 /// This structure holds any data we need about the edges being traversed 1657 /// during buildTree_rec(). We keep track of: 1658 /// (i) the user TreeEntry index, and 1659 /// (ii) the index of the edge. 1664 /// The user TreeEntry. 1666 /// The operand index of the use. 1677 <<
" EdgeIdx:" <<
EdgeIdx <<
"}";
1686 /// A helper class used for scoring candidates for two consecutive lanes. 1692int NumLanes;
// Total number of lanes (aka vectorization factor). 1693int MaxLevel;
// The maximum recursion depth for accumulating score. 1699 : TLI(TLI),
DL(
DL), SE(SE), R(R), NumLanes(NumLanes),
1700 MaxLevel(MaxLevel) {}
1702// The hard-coded scores listed here are not very important, though it shall 1703// be higher for better matches to improve the resulting cost. When 1704// computing the scores of matching one sub-tree with another, we are 1705// basically counting the number of values that are matching. So even if all 1706// scores are set to 1, we would still get a decent matching result. 1707// However, sometimes we have to break ties. For example we may have to 1708// choose between matching loads vs matching opcodes. This is what these 1709// scores are helping us with: they provide the order of preference. Also, 1710// this is important if the scalar is externally used or used in another 1711// tree entry node in the different lane. 1713 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]). 1715 /// The same load multiple times. This should have a better score than 1716 /// `ScoreSplat` because it in x86 for a 2-lane vector we can represent it 1717 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for 1718 /// a vector load and 1.0 for a broadcast. 1720 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]). 1722 /// A load candidate for masked gather. 1724 /// ExtractElementInst from same vector and consecutive indexes. 1726 /// ExtractElementInst from same vector and reversed indices. 1730 /// Instructions with the same opcode. 1732 /// Instructions with alt opcodes (e.g, add + sub). 1734 /// Identical instructions (a.k.a. splat or broadcast). 1736 /// Matching with an undef is preferable to failing. 1738 /// Score for failing to find a decent match. 1740 /// Score if all users are vectorized. 1743 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes. 1744 /// \p U1 and \p U2 are the users of \p V1 and \p V2. 1745 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p 1754if (isa<LoadInst>(V1)) {
1755// Retruns true if the users of V1 and V2 won't need to be extracted. 1756auto AllUsersAreInternal = [U1, U2,
this](
Value *V1,
Value *V2) {
1757// Bail out if we have too many uses to save compilation time. 1761auto AllUsersVectorized = [U1, U2,
this](
Value *V) {
1763 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1766return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1768// A broadcast of a load can be cheaper on some targets. 1769if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1771 ((
int)V1->getNumUses() == NumLanes ||
1772 AllUsersAreInternal(V1, V2)))
1778auto CheckSameEntryOrFail = [&]() {
1779if (
const TreeEntry *TE1 = R.getTreeEntry(V1);
1780 TE1 && TE1 == R.getTreeEntry(V2))
1785auto *LI1 = dyn_cast<LoadInst>(V1);
1786auto *LI2 = dyn_cast<LoadInst>(V2);
1788if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1790return CheckSameEntryOrFail();
1793 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1794 LI2->getPointerOperand(),
DL, SE,
/*StrictCheck=*/true);
1795if (!Dist || *Dist == 0) {
1798 R.TTI->isLegalMaskedGather(
1801return CheckSameEntryOrFail();
1803// The distance is too large - still may be profitable to use masked 1805if (std::abs(*Dist) > NumLanes / 2)
1807// This still will detect consecutive loads, but we might have "holes" 1808// in some cases. It is ok for non-power-2 vectorization and may produce 1809// better results. It should not affect current vectorization. 1814auto *C1 = dyn_cast<Constant>(V1);
1815auto *C2 = dyn_cast<Constant>(V2);
1819// Extracts from consecutive indexes of the same vector better score as 1820// the extracts could be optimized away. 1824// Undefs are always profitable for extractelements. 1825// Compiler can easily combine poison and extractelement <non-poison> or 1826// undef and extractelement <poison>. But combining undef + 1827// extractelement <non-poison-but-may-produce-poison> requires some 1829if (isa<UndefValue>(V2))
1838// Undefs are always profitable for extractelements. 1846int Dist = Idx2 - Idx1;
1847// The distance is too large - still may be profitable to use 1849if (std::abs(Dist) == 0)
1851if (std::abs(Dist) > NumLanes / 2)
1858return CheckSameEntryOrFail();
1861auto *I1 = dyn_cast<Instruction>(V1);
1862auto *I2 = dyn_cast<Instruction>(V2);
1864if (I1->getParent() != I2->getParent())
1865return CheckSameEntryOrFail();
1870// Note: Only consider instructions with <= 2 operands to avoid 1871// complexity explosion. 1873 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.
empty() ||
1874 !S.isAltShuffle()) &&
1876return isa<PoisonValue>(V) ||
1877 cast<Instruction>(V)->getNumOperands() ==
1878 S.getMainOp()->getNumOperands();
1884if (I1 && isa<PoisonValue>(V2))
1887if (isa<UndefValue>(V2))
1890return CheckSameEntryOrFail();
1893 /// Go through the operands of \p LHS and \p RHS recursively until 1894 /// MaxLevel, and return the cummulative score. \p U1 and \p U2 are 1895 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands 1896 /// of \p U1 and \p U2), except at the beginning of the recursion where 1897 /// these are set to nullptr. 1901 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1] 1906 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at 1907 /// each level recursively, accumulating the score. It starts from matching 1908 /// the additions at level 0, then moves on to the loads (level 1). The 1909 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and 1910 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while 1911 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail. 1912 /// Please note that the order of the operands does not matter, as we 1913 /// evaluate the score of all profitable combinations of operands. In 1914 /// other words the score of G1 and G4 is the same as G1 and G2. This 1915 /// heuristic is based on ideas described in: 1916 /// Look-ahead SLP: Auto-vectorization in the presence of commutative 1917 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha, 1923// Get the shallow score of V1 and V2. 1924int ShallowScoreAtThisLevel =
1927// If reached MaxLevel, 1928// or if V1 and V2 are not instructions, 1929// or if they are SPLAT, 1930// or if they are not consecutive, 1931// or if profitable to vectorize loads or extractelements, early return 1933auto *I1 = dyn_cast<Instruction>(
LHS);
1934auto *I2 = dyn_cast<Instruction>(
RHS);
1935if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1937 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1938 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1939 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1940 ShallowScoreAtThisLevel))
1941return ShallowScoreAtThisLevel;
1942assert(I1 && I2 &&
"Should have early exited.");
1944// Contains the I2 operand indexes that got matched with I1 operands. 1947// Recursion towards the operands of I1 and I2. We are trying all possible 1948// operand pairs, and keeping track of the best score. 1949for (
unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1950 OpIdx1 != NumOperands1; ++OpIdx1) {
1951// Try to pair op1I with the best operand of I2. 1953unsigned MaxOpIdx2 = 0;
1954bool FoundBest =
false;
1955// If I2 is commutative try all combinations. 1958 ? I2->getNumOperands()
1959 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1960assert(FromIdx <= ToIdx &&
"Bad index");
1961for (
unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1962// Skip operands already paired with OpIdx1. 1963if (Op2Used.
count(OpIdx2))
1965// Recursively calculate the cost at each level 1968 I1, I2, CurrLevel + 1, {});
1969// Look for the best score. 1971 TmpScore > MaxTmpScore) {
1972 MaxTmpScore = TmpScore;
1978// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it. 1979 Op2Used.
insert(MaxOpIdx2);
1980 ShallowScoreAtThisLevel += MaxTmpScore;
1983return ShallowScoreAtThisLevel;
1986 /// A helper data structure to hold the operands of a vector of instructions. 1987 /// This supports a fixed vector length for all operand vectors. 1989 /// For each operand we need (i) the value, and (ii) the opcode that it 1990 /// would be attached to if the expression was in a left-linearized form. 1991 /// This is required to avoid illegal operand reordering. 1996 /// Op1 Op2 Linearized + Op2 1997 /// \ / ----------> |/ 2000 /// Op1 - Op2 (0 + Op1) - Op2 2003 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'. 2005 /// Another way to think of this is to track all the operations across the 2006 /// path from the operand all the way to the root of the tree and to 2007 /// calculate the operation that corresponds to this path. For example, the 2008 /// path from Op2 to the root crosses the RHS of the '-', therefore the 2009 /// corresponding operation is a '-' (which matches the one in the 2010 /// linearized tree, as shown above). 2012 /// For lack of a better term, we refer to this operation as Accumulated 2013 /// Path Operation (APO). 2015 OperandData() =
default;
2016 OperandData(
Value *V,
bool APO,
bool IsUsed)
2017 : V(V), APO(APO), IsUsed(IsUsed) {}
2018 /// The operand value. 2020 /// TreeEntries only allow a single opcode, or an alternate sequence of 2021 /// them (e.g, +, -). Therefore, we can safely use a boolean value for the 2022 /// APO. It is set to 'true' if 'V' is attached to an inverse operation 2023 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise 2026 /// Helper data for the reordering function. 2030 /// During operand reordering, we are trying to select the operand at lane 2031 /// that matches best with the operand at the neighboring lane. Our 2032 /// selection is based on the type of value we are looking for. For example, 2033 /// if the neighboring lane has a load, we need to look for a load that is 2034 /// accessing a consecutive address. These strategies are summarized in the 2035 /// 'ReorderingMode' enumerator. 2036enum class ReorderingMode {
2037 Load,
///< Matching loads to consecutive memory addresses 2038 Opcode,
///< Matching instructions based on opcode (same or alternate) 2040Splat,
///< Matching the same instruction multiple times (broadcast) 2041Failed,
///< We failed to create a vectorizable group 2046 /// A vector of operand vectors. 2048 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0] 2049 /// is not IntrinsicInst, ArgSize is User::getNumOperands. 2050unsigned ArgSize = 0;
2056constLoop *L =
nullptr;
2058 /// \returns the operand data at \p OpIdx and \p Lane. 2059 OperandData &getData(
unsigned OpIdx,
unsigned Lane) {
2060return OpsVec[OpIdx][Lane];
2063 /// \returns the operand data at \p OpIdx and \p Lane. Const version. 2064const OperandData &getData(
unsigned OpIdx,
unsigned Lane)
const{
2065return OpsVec[OpIdx][Lane];
2068 /// Clears the used flag for all entries. 2070for (
unsigned OpIdx = 0, NumOperands = getNumOperands();
2071 OpIdx != NumOperands; ++OpIdx)
2072for (
unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2074 OpsVec[OpIdx][Lane].IsUsed =
false;
2077 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2. 2078void swap(
unsigned OpIdx1,
unsigned OpIdx2,
unsigned Lane) {
2079std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2082 /// \param Lane lane of the operands under analysis. 2083 /// \param OpIdx operand index in \p Lane lane we're looking the best 2085 /// \param Idx operand index of the current candidate value. 2086 /// \returns The additional score due to possible broadcasting of the 2087 /// elements in the lane. It is more profitable to have power-of-2 unique 2088 /// elements in the lane, it will be vectorized with higher probability 2089 /// after removing duplicates. Currently the SLP vectorizer supports only 2090 /// vectorization of the power-of-2 number of unique scalars. 2091int getSplatScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx,
2093Value *IdxLaneV = getData(
Idx, Lane).V;
2094if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2095 isa<ExtractElementInst>(IdxLaneV))
2098for (
unsigned Ln : seq<unsigned>(getNumLanes())) {
2101Value *OpIdxLnV = getData(OpIdx, Ln).V;
2102if (!isa<Instruction>(OpIdxLnV))
2106unsigned UniquesCount = Uniques.
size();
2107auto IdxIt = Uniques.
find(IdxLaneV);
2108unsigned UniquesCntWithIdxLaneV =
2109 IdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2110Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2111auto OpIdxIt = Uniques.
find(OpIdxLaneV);
2112unsigned UniquesCntWithOpIdxLaneV =
2113 OpIdxIt != Uniques.
end() ? UniquesCount : UniquesCount + 1;
2114if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2116return std::min(
bit_ceil(UniquesCntWithOpIdxLaneV) -
2117 UniquesCntWithOpIdxLaneV,
2118 UniquesCntWithOpIdxLaneV -
2120 ((IdxIt != Uniques.
end() && UsedLanes.
test(IdxIt->second))
2121 ? UniquesCntWithIdxLaneV -
bit_floor(UniquesCntWithIdxLaneV)
2122 :
bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2125 /// \param Lane lane of the operands under analysis. 2126 /// \param OpIdx operand index in \p Lane lane we're looking the best 2128 /// \param Idx operand index of the current candidate value. 2129 /// \returns The additional score for the scalar which users are all 2131int getExternalUseScore(
unsigned Lane,
unsigned OpIdx,
unsignedIdx)
const{
2132Value *IdxLaneV = getData(
Idx, Lane).V;
2133Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2134// Do not care about number of uses for vector-like instructions 2135// (extractelement/extractvalue with constant indices), they are extracts 2136// themselves and already externally used. Vectorization of such 2137// instructions does not add extra extractelement instruction, just may 2142auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2143if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2145return R.areAllUsersVectorized(IdxLaneI)
2150 /// Score scaling factor for fully compatible instructions but with 2151 /// different number of external uses. Allows better selection of the 2152 /// instructions with less external uses. 2153staticconstint ScoreScaleFactor = 10;
2155 /// \Returns the look-ahead score, which tells us how much the sub-trees 2156 /// rooted at \p LHS and \p RHS match, the more they match the higher the 2157 /// score. This helps break ties in an informed way when we cannot decide on 2158 /// the order of the operands by just considering the immediate 2161int Lane,
unsigned OpIdx,
unsignedIdx,
2165// Keep track of the instruction stack as we recurse into the operands 2166// during the look-ahead score exploration. 2169/*CurrLevel=*/1, MainAltOps);
2171int SplatScore = getSplatScore(Lane, OpIdx,
Idx, UsedLanes);
2172if (Score <= -SplatScore) {
2176 Score += SplatScore;
2177// Scale score to see the difference between different operands 2178// and similar operands but all vectorized/not all vectorized 2179// uses. It does not affect actual selection of the best 2180// compatible operand in general, just allows to select the 2181// operand with all vectorized uses. 2182 Score *= ScoreScaleFactor;
2183 Score += getExternalUseScore(Lane, OpIdx,
Idx);
  /// Best defined scores per lanes between the passes. Used to choose the
  /// best operand (with the highest score) between the passes.
  /// The key - {Operand Index, Lane}.
  /// The value - the best score between the passes for the lane and the
  /// operand.
  SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8> BestScoresPerLanes;

  // Search all operands in Ops[*][Lane] for the one that matches best
  // Ops[OpIdx][LastLane] and return its operand index.
  // If no good match can be found, return std::nullopt.
  std::optional<unsigned>
  getBestOperand(unsigned OpIdx, int Lane, int LastLane,
                 ArrayRef<ReorderingMode> ReorderingModes,
                 ArrayRef<Value *> MainAltOps,
                 const SmallBitVector &UsedLanes) {
    unsigned NumOperands = getNumOperands();

    // The operand of the previous lane at OpIdx.
    Value *OpLastLane = getData(OpIdx, LastLane).V;

    // Our strategy mode for OpIdx.
    ReorderingMode RMode = ReorderingModes[OpIdx];
    if (RMode == ReorderingMode::Failed)
      return std::nullopt;

    // The linearized opcode of the operand at OpIdx, Lane.
    bool OpIdxAPO = getData(OpIdx, Lane).APO;

    // The best operand index and its score.
    // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
    // are using the score to differentiate between the two.
    struct BestOpData {
      std::optional<unsigned> Idx;
      unsigned Score = 0;
    } BestOp;
    BestOp.Score =
        BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
            .first->second;

    // Track if the operand must be marked as used. If the operand is set to
    // Score 1 explicitly (because of non power-of-2 unique scalars, we may
    // want to reestimate the operands again on the following iterations).
    bool IsUsed = RMode == ReorderingMode::Splat ||
                  RMode == ReorderingMode::Constant ||
                  RMode == ReorderingMode::Load;
    // Iterate through all unused operands and look for the best.
    for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
      // Get the operand at Idx and Lane.
      OperandData &OpData = getData(Idx, Lane);
      Value *Op = OpData.V;
      bool OpAPO = OpData.APO;

      // Skip already selected operands.
      if (OpData.IsUsed)
        continue;

      // Skip if we are trying to move the operand to a position with a
      // different opcode in the linearized tree form. This would break the
      // semantics.
      if (OpAPO != OpIdxAPO)
        continue;

      // Look for an operand that matches the current mode.
      switch (RMode) {
      case ReorderingMode::Load:
      case ReorderingMode::Opcode: {
        bool LeftToRight = Lane > LastLane;
        Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
        Value *OpRight = (LeftToRight) ? Op : OpLastLane;
        int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                      OpIdx, Idx, IsUsed, UsedLanes);
        if (Score > static_cast<int>(BestOp.Score) ||
            (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
             Idx == OpIdx)) {
          BestOp.Idx = Idx;
          BestOp.Score = Score;
          BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
        }
        break;
      }
      case ReorderingMode::Constant:
        if (isa<Constant>(Op) ||
            (!BestOp.Score && L && L->isLoopInvariant(Op))) {
          BestOp.Idx = Idx;
          if (isa<Constant>(Op)) {
            BestOp.Score = LookAheadHeuristics::ScoreConstants;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                LookAheadHeuristics::ScoreConstants;
          }
          if (isa<UndefValue>(Op) || !isa<Constant>(Op))
            IsUsed = false;
        }
        break;
      case ReorderingMode::Splat:
        if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
          IsUsed = Op == OpLastLane;
          if (Op == OpLastLane) {
            BestOp.Score = LookAheadHeuristics::ScoreSplat;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
                LookAheadHeuristics::ScoreSplat;
          }
          BestOp.Idx = Idx;
        }
        break;
      case ReorderingMode::Failed:
        llvm_unreachable("Not expected Failed reordering mode.");
      }
    }

    if (BestOp.Idx) {
      getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
      return BestOp.Idx;
    }
    // If we could not find a good match return std::nullopt.
    return std::nullopt;
  }

  /// Helper for reorderOperandVecs.
  /// \returns the lane that we should start reordering from. This is the one
  /// which has the least number of operands that can freely move about or
  /// less profitable because it already has the most optimal set of operands.
  unsigned getBestLaneToStartReordering() const {
    unsigned Min = UINT_MAX;
    unsigned SameOpNumber = 0;
    // std::pair<unsigned, unsigned> is used to implement a simple voting
    // algorithm and choose the lane with the least number of operands that
    // can freely move about or less profitable because it already has the
    // most optimal set of operands. The first unsigned is a counter for
    // voting, the second unsigned is the counter of lanes with instructions
    // with same/alternate opcodes and same parent basic block.
    SmallMapVector<unsigned, std::pair<unsigned, unsigned>, 4> HashMap;
    // Try to be closer to the original results, if we have multiple lanes
    // with same cost. If 2 lanes have the same cost, use the one with the
    // highest index.
    for (int I = getNumLanes(); I > 0; --I) {
      unsigned Lane = I - 1;
      OperandsOrderData NumFreeOpsHash =
          getMaxNumOperandsThatCanBeReordered(Lane);
      // Compare the number of operands that can move and choose the one with
      // the least number.
      if (NumFreeOpsHash.NumOfAPOs < Min) {
        Min = NumFreeOpsHash.NumOfAPOs;
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap.clear();
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
        // Select the most optimal lane in terms of number of operands that
        // should be moved around.
        SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
        HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
      } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
        auto [It, Inserted] =
            HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
        if (!Inserted)
          ++It->second.first;
      }
    }
    // Select the lane with the minimum counter.
    unsigned BestLane = 0;
    unsigned CntMin = UINT_MAX;
    for (const auto &Data : reverse(HashMap)) {
      if (Data.second.first < CntMin) {
        CntMin = Data.second.first;
        BestLane = Data.second.second;
      }
    }
    return BestLane;
  }
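  // Illustrative example (hypothetical lanes): if lane 1 holds the only
  // non-commutative operation (say a subtraction), it reports the fewest
  // freely movable operands, wins the vote above, and reordering starts
  // there, so the flexible lanes are rearranged around the rigid one.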
  /// Data structure that helps to reorder operands.
  struct OperandsOrderData {
    /// The best number of operands with the same APOs, which can be
    /// reordered.
    unsigned NumOfAPOs = UINT_MAX;
    /// Number of operands with the same/alternate instruction opcode and
    /// parent.
    unsigned NumOpsWithSameOpcodeParent = 0;
    /// Hash for the actual operands ordering.
    /// Used to count operands, actually their position id and opcode
    /// value. It is used in the voting mechanism to find the lane with the
    /// least number of operands that can freely move about or less profitable
    /// because it already has the most optimal set of operands. Can be
    /// replaced with SmallVector<unsigned> instead but hash code is faster
    /// and requires less memory.
    unsigned Hash = 0;
  };

  /// \returns the maximum number of operands that are allowed to be reordered
  /// for \p Lane and the number of compatible instructions (with the same
  /// parent/opcode). This is used as a heuristic for selecting the first lane
  /// to start operand reordering.
  OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
    unsigned CntTrue = 0;
    unsigned NumOperands = getNumOperands();
    // Operands with the same APO can be reordered. We therefore need to count
    // how many of them we have for each APO, like this: Cnt[APO] = x.
    // Since we only have two APOs, namely true and false, we can avoid using
    // a map. Instead we can simply count the number of operands that
    // correspond to one of them (in this case the 'true' APO), and calculate
    // the other by subtracting it from the total number of operands.
    // Operands with the same instruction opcode and parent are more
    // profitable since we don't need to move them in many cases, with a high
    // probability such lane already can be vectorized effectively.
    bool AllUndefs = true;
    unsigned NumOpsWithSameOpcodeParent = 0;
    Instruction *OpcodeI = nullptr;
    BasicBlock *Parent = nullptr;
    unsigned Hash = 0;
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      const OperandData &OpData = getData(OpIdx, Lane);
      if (OpData.APO)
        ++CntTrue;
      // Use Boyer-Moore majority voting for finding the majority opcode and
      // the number of times it occurs.
      if (auto *I = dyn_cast<Instruction>(OpData.V)) {
        if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
            I->getParent() != Parent) {
          if (NumOpsWithSameOpcodeParent == 0) {
            NumOpsWithSameOpcodeParent = 1;
            OpcodeI = I;
            Parent = I->getParent();
          } else {
            --NumOpsWithSameOpcodeParent;
          }
        } else {
          ++NumOpsWithSameOpcodeParent;
        }
      }
      Hash = hash_combine(
          Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
      AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
    }
    if (AllUndefs)
      return {};
    OperandsOrderData Data;
    Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
    Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
    Data.Hash = Hash;
    return Data;
  }
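  // Boyer-Moore voting in this context, illustrated on a hypothetical lane
  // holding operands {add, add, mul, add}: the counter goes 1, 2, 1, 2, so
  // 'add' survives as the majority candidate with a final count of 2 without
  // ever keeping a per-opcode histogram.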
  /// Go through the instructions in VL and append their operands.
  void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
    assert((empty() || VL.size() == getNumLanes()) &&
           "Expected same number of lanes");
    assert(S.valid() && "InstructionsState is invalid.");
    // IntrinsicInst::isCommutative returns true if swapping the first "two"
    // arguments to the intrinsic produces the same result.
    constexpr unsigned IntrinsicNumOperands = 2;
    Instruction *MainOp = S.getMainOp();
    unsigned NumOperands = MainOp->getNumOperands();
    ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
    OpsVec.resize(NumOperands);
    unsigned NumLanes = VL.size();
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      OpsVec[OpIdx].resize(NumLanes);
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
               "Expected instruction or poison value");
        // Our tree has just 3 nodes: the root and two operands.
        // It is therefore trivial to get the APO. We only need to check the
        // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
        // RHS operand. The LHS operand of both add and sub is never attached
        // to an inverse operation in the linearized form, therefore its APO
        // is false. The RHS is true only if VL[Lane] is an inverse operation.

        // Since operand reordering is performed on groups of commutative
        // operations or alternating sequences (e.g., +, -), we can safely
        // tell the inverse operations by checking commutativity.
        if (isa<PoisonValue>(VL[Lane])) {
          if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
            if (OpIdx == 0) {
              OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
              continue;
            }
          } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
            if (OpIdx == 0) {
              OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
              continue;
            }
          }
          OpsVec[OpIdx][Lane] = {
              PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
              false};
          continue;
        }
        bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
        bool APO = (OpIdx == 0) ? false : IsInverseOperation;
        OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                               APO, false};
      }
    }
  }
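  // APO example (hypothetical lanes): for VL = {a0 + b0, a1 - b1} the LHS
  // operands a0/a1 always get APO == false, while the RHS operands get
  // APO == false for the add and APO == true for the sub, so reordering never
  // moves b1 of the subtraction into an addition slot.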
  /// \returns the number of operands.
  unsigned getNumOperands() const { return ArgSize; }

  /// \returns the number of lanes.
  unsigned getNumLanes() const { return OpsVec[0].size(); }

  /// \returns the operand value at \p OpIdx and \p Lane.
  Value *getValue(unsigned OpIdx, unsigned Lane) const {
    return getData(OpIdx, Lane).V;
  }

  /// \returns true if the data structure is empty.
  bool empty() const { return OpsVec.empty(); }

  /// Clears the data.
  void clear() { OpsVec.clear(); }
  /// \Returns true if there are enough operands identical to \p Op to fill
  /// the whole vector (it is mixed with constants or loop invariant values).
  /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
  bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");
    // Small number of loads - try load matching.
    if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
      return false;
    bool OpAPO = getData(OpIdx, Lane).APO;
    bool IsInvariant = L && L->isLoopInvariant(Op);
    unsigned Cnt = 0;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (Ln == Lane)
        continue;
      // This is set to true if we found a candidate for broadcast at Lane.
      bool FoundCandidate = false;
      for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
        OperandData &Data = getData(OpI, Ln);
        if (Data.APO != OpAPO || Data.IsUsed)
          continue;
        Value *OpILane = getValue(OpI, Lane);
        bool IsConstantOp = isa<Constant>(OpILane);
        // Consider the broadcast candidate if:
        // 1. Same value is found in one of the operands.
        if (Data.V == Op ||
            // 2. The operand in the given lane is not constant but there is a
            //    constant operand in another lane (which can be moved to the
            //    given lane). In this case we can represent it as a simple
            //    permutation of constant and broadcast.
            (!IsConstantOp &&
             ((Lns > 2 && isa<Constant>(Data.V)) ||
              // 2.1. If we have only 2 lanes, need to check that value in the
              //      next lane does not build same opcode sequence.
              (Lns == 2 && !getSameOpcode({Op, Data.V}, TLI) &&
               isa<Constant>(Data.V)))) ||
            // 3. The operand in the current lane is loop invariant (can be
            //    hoisted out) and another operand is also a loop invariant
            //    (though not a constant). In this case the whole vector can be
            //    hoisted out.
            // FIXME: need to teach the cost model about this case for better
            //        estimation.
            (IsInvariant && !isa<Constant>(Data.V) &&
             !getSameOpcode({Op, Data.V}, TLI) &&
             L->isLoopInvariant(Data.V))) {
          FoundCandidate = true;
          Data.IsUsed = Data.V == Op;
          if (Data.V == Op)
            ++Cnt;
          break;
        }
      }
      if (!FoundCandidate)
        return false;
    }
    return getNumLanes() == 2 || Cnt > 1;
  }
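  // For illustration (hypothetical operands): with lanes {x + 1, x + y,
  // x + 2}, the value x is found in every lane while the remaining slots are
  // constants or movable values, so broadcasting x plus a small permutation
  // of the leftover operands covers the whole vector.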
  /// Checks if there is at least single compatible operand in lanes other
  /// than \p Lane, compatible with the operand \p Op.
  bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
    assert(Op == getValue(OpIdx, Lane) &&
           "Op is expected to be getValue(OpIdx, Lane).");
    bool OpAPO = getData(OpIdx, Lane).APO;
    for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
      if (Ln == Lane)
        continue;
      if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
            const OperandData &Data = getData(OpI, Ln);
            if (Data.APO != OpAPO || Data.IsUsed)
              return false;
            Value *OpILn = getValue(OpI, Ln);
            return (L && L->isLoopInvariant(OpILn)) ||
                   getSameOpcode({Op, OpILn}, TLI).valid();
          }))
        return true;
    }
    return false;
  }
  /// Initialize with all the operands of the instruction vector \p RootVL.
  VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
             const BoUpSLP &R)
      : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
        L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
    // Append all the operands of RootVL.
    appendOperandsOfVL(RootVL, S);
  }

  /// \Returns a value vector with the operands across all lanes for the
  /// operand at \p OpIdx.
  ValueList getVL(unsigned OpIdx) const {
    ValueList OpVL(OpsVec[OpIdx].size());
    assert(OpsVec[OpIdx].size() == getNumLanes() &&
           "Expected same num of lanes across all operands");
    for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
      OpVL[Lane] = OpsVec[OpIdx][Lane].V;
    return OpVL;
  }
  // Performs operand reordering for 2 or more operands.
  // The original operands are in OrigOps[OpIdx][Lane].
  // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
  void reorder() {
    unsigned NumOperands = getNumOperands();
    unsigned NumLanes = getNumLanes();
    // Each operand has its own mode. We are using this mode to help us select
    // the instructions for each lane, so that they match best with the ones
    // we have selected so far.
    SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);

    // This is a greedy single-pass algorithm. We are going over each lane
    // once and deciding on the best order right away with no back-tracking.
    // However, in order to increase its effectiveness, we start with the lane
    // that has operands that can move the least. For example, given the
    // following lanes:
    //  Lane 0 : A[0] = B[0] + C[0]   // Visited 3rd
    //  Lane 1 : A[1] = C[1] - B[1]   // Visited 1st
    //  Lane 2 : A[2] = B[2] + C[2]   // Visited 2nd
    //  Lane 3 : A[3] = C[3] - B[3]   // Visited 4th
    // we will start at Lane 1, since the operands of the subtraction cannot
    // be reordered. Then we will visit the rest of the lanes in a circular
    // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.

    // Find the first lane that we will start our search from.
    unsigned FirstLane = getBestLaneToStartReordering();

    // Initialize the modes.
    for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
      Value *OpLane0 = getValue(OpIdx, FirstLane);
      // Keep track if we have instructions with all the same opcode on one
      // side.
      if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
        // Check if OpLane0 should be broadcast.
        if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
            !canBeVectorized(OpILane0, OpIdx, FirstLane))
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        else if (isa<LoadInst>(OpILane0))
          ReorderingModes[OpIdx] = ReorderingMode::Load;
        else
          ReorderingModes[OpIdx] = ReorderingMode::Opcode;
      } else if (isa<Constant>(OpLane0)) {
        ReorderingModes[OpIdx] = ReorderingMode::Constant;
      } else if (isa<Argument>(OpLane0)) {
        // Our best hope is a Splat. It may save some cost in some cases.
        ReorderingModes[OpIdx] = ReorderingMode::Splat;
      } else {
        llvm_unreachable("Unexpected value kind.");
      }
    }

    // Check that we don't have same operands. No need to reorder if operands
    // are just perfect diamond or shuffled diamond match. Do not do it only
    // for possible broadcasts or non-power of 2 number of scalars (just for
    // now).
    auto &&SkipReordering = [this]() {
      SmallPtrSet<Value *, 4> UniqueValues;
      ArrayRef<OperandData> Op0 = OpsVec.front();
      for (const OperandData &Data : Op0)
        UniqueValues.insert(Data.V);
      for (ArrayRef<OperandData> Op :
           ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
        if (any_of(Op, [&UniqueValues](const OperandData &Data) {
              return !UniqueValues.contains(Data.V);
            }))
          return false;
      }
      // TODO: Check if we can remove a check for non-power-2 number of
      // scalars after full support of non-power-2 vectorization.
      return UniqueValues.size() != 2 &&
             hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
                                      UniqueValues.size());
    };

    // If the initial strategy fails for any of the operand indexes, then we
    // perform reordering again in a second pass. This helps avoid assigning
    // high priority to the failed strategy, and should improve reordering for
    // the non-failed operand indexes.
    for (int Pass = 0; Pass != 2; ++Pass) {
      // Check if no need to reorder operands since they are perfect or
      // shuffled diamond match.
      // Need to do it to avoid extra external use cost counting for
      // shuffled matches, which may cause regressions.
      if (SkipReordering())
        break;
      // Skip the second pass if the first pass did not fail.
      bool StrategyFailed = false;
      // Mark all operand data as free to use.
      clearUsed();
      // We keep the original operand order for the FirstLane, so reorder the
      // rest of the lanes. We are visiting the nodes in a circular fashion,
      // using FirstLane as the center point and increasing the radius
      // distance.
      SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      SmallBitVector UsedLanes(NumLanes);
      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        // Visit the lane on the right and then the lane on the left.
        for (int Direction : {+1, -1}) {
          int Lane = FirstLane + Direction * Distance;
          if (Lane < 0 || Lane >= (int)NumLanes)
            continue;
          UsedLanes.set(Lane);
          int LastLane = Lane - Direction;
          assert(LastLane >= 0 && LastLane < (int)NumLanes &&
                 "Out of bounds");
          // Look for a good match for each operand.
          for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
            // Search for the operand that matches SortedOps[OpIdx][Lane-1].
            std::optional<unsigned> BestIdx =
                getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                               MainAltOps[OpIdx], UsedLanes);
            // By not selecting a value, we allow the operands that follow to
            // select a better matching value. We will get a non-null value in
            // the next run of getBestOperand().
            if (BestIdx) {
              // Swap the current operand with the one returned by
              // getBestOperand().
              swap(OpIdx, *BestIdx, Lane);
            } else {
              // Enable the second pass.
              StrategyFailed = true;
            }
            // Try to get the alternate opcode and follow it during analysis.
            if (MainAltOps[OpIdx].size() != 2) {
              OperandData &AltOp = getData(OpIdx, Lane);
              InstructionsState OpS =
                  getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
              if (OpS && OpS.isAltShuffle())
                MainAltOps[OpIdx].push_back(AltOp.V);
            }
          }
        }
      }
      // Skip second pass if the strategy did not fail.
      if (!StrategyFailed)
        break;
    }
  }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
  LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
    switch (RMode) {
    case ReorderingMode::Load:
      return "Load";
    case ReorderingMode::Opcode:
      return "Opcode";
    case ReorderingMode::Constant:
      return "Constant";
    case ReorderingMode::Splat:
      return "Splat";
    case ReorderingMode::Failed:
      return "Failed";
    }
    llvm_unreachable("Unimplemented Reordering Type");
  }

  LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
    const unsigned Indent = 2;
    unsigned Cnt = 0;
    for (const auto &OpDataVec : OpsVec) {
      OS << "Operand " << Cnt++ << "\n";
      for (const OperandData &OpData : OpDataVec) {
        OS.indent(Indent) << "{";
        if (Value *V = OpData.V)
          OS << *V;
        else
          OS << "null";
        OS << ", APO:" << OpData.APO << "}\n";
      }
      OS << "\n";
    }
    return OS;
  }

  /// Debug print.
  LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
#endif
};
  /// Evaluate each pair in \p Candidates and return index into \p Candidates
  /// for a pair which has the highest score deemed to have the best chance to
  /// form the root of a profitable tree to vectorize. Return std::nullopt if
  /// no candidate scored above the LookAheadHeuristics::ScoreFail. \param
  /// Limit Lower limit of the cost, considered to be good enough score.
  std::optional<int>
  findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
                   int Limit = LookAheadHeuristics::ScoreFail) const {
    LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
                                  RootLookAheadMaxDepth);
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
      int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
                                               Candidates[I].second,
                                               /*U1=*/nullptr, /*U2=*/nullptr,
                                               /*CurrLevel=*/1, {});
      if (Score > BestScore) {
        BestScore = Score;
        Index = I;
      }
    }
    return Index;
  }

  /// Checks if the instruction is marked for deletion.
  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }

  /// Removes an instruction from its block and eventually deletes it.
  /// It's like Instruction::eraseFromParent() except that the actual deletion
  /// is delayed until BoUpSLP is destructed.
  void eraseInstruction(Instruction *I) {
    DeletedInstructions.insert(I);
  }
2839 /// Remove instructions from the parent function and clear the operands of \p 2840 /// DeadVals instructions, marking for deletion trivially dead operands. 2841template <
typename T>
2844for (
T *V : DeadVals) {
2845auto *
I = cast<Instruction>(V);
2846 DeletedInstructions.insert(
I);
2849for (
T *V : DeadVals) {
2850if (!V || !Processed.
insert(V).second)
2852auto *
I = cast<Instruction>(V);
2855if (
const TreeEntry *Entry = getTreeEntry(
I)) {
2856 Entries.push_back(Entry);
2857auto It = MultiNodeScalars.find(
I);
2858if (It != MultiNodeScalars.end())
2859 Entries.append(It->second.begin(), It->second.end());
2861for (
Use &U :
I->operands()) {
2862if (
auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2863 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2865 (Entries.empty() ||
none_of(Entries, [&](
const TreeEntry *Entry) {
2866return Entry->VectorizedValue == OpI;
2870I->dropAllReferences();
2872for (
T *V : DeadVals) {
2873auto *
I = cast<Instruction>(V);
2879 cast<Instruction>(U.getUser()));
2881"trying to erase instruction with users.");
2882I->removeFromParent();
2885// Process the dead instruction list until empty. 2886while (!DeadInsts.
empty()) {
2889if (!VI || !VI->getParent())
2892"Live instruction found in dead worklist!");
2893assert(VI->use_empty() &&
"Instructions with uses are not dead.");
2895// Don't lose the debug info while deleting the instructions. 2898// Null out all of the instruction's operands to see if any operand 2899// becomes dead as we go. 2900for (
Use &OpU : VI->operands()) {
2901Value *OpV = OpU.get();
2909// If the operand is an instruction that became dead as we nulled out 2910// the operand, and if it is 'trivially' dead, delete it in a future 2912if (
auto *OpI = dyn_cast<Instruction>(OpV))
2913if (!DeletedInstructions.contains(OpI) &&
2918 VI->removeFromParent();
2919 DeletedInstructions.insert(VI);
2924 /// Checks if the instruction was already analyzed for being possible 2927return AnalyzedReductionsRoots.count(
I);
2929 /// Register given instruction as already analyzed for being possible 2932 AnalyzedReductionsRoots.insert(
I);
2934 /// Checks if the provided list of reduced values was checked already for 2939 /// Adds the list of reduced values to list of already checked values for the 2944 /// Clear the list of the analyzed reduction root instructions. 2946 AnalyzedReductionsRoots.clear();
2947 AnalyzedReductionVals.
clear();
2948 AnalyzedMinBWVals.
clear();
2950 /// Checks if the given value is gathered in one of the nodes. 2954 /// Checks if the given value is gathered in one of the nodes. 2958 /// Checks if the specified value was not schedule. 2960return NonScheduledFirst.
contains(V);
2963 /// Check if the value is vectorized in the tree. 2969 /// Determine if a node \p E in can be demoted to a smaller type with a 2970 /// truncation. We collect the entries that will be demoted in ToDemote. 2971 /// \param E Node for analysis 2972 /// \param ToDemote indices of the nodes to be demoted. 2973bool collectValuesToDemote(
2974const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
2977bool &IsProfitableToDemote,
bool IsTruncRoot)
const;
2979 /// Check if the operands on the edges \p Edges of the \p UserTE allows 2980 /// reordering (i.e. the operands can be reordered because they have only one 2981 /// user and reordarable). 2982 /// \param ReorderableGathers List of all gather nodes that require reordering 2983 /// (e.g., gather of extractlements or partially vectorizable loads). 2984 /// \param GatherOps List of gather operand nodes for \p UserTE that require 2985 /// reordering, subset of \p NonVectorized. 2987 canReorderOperands(TreeEntry *UserTE,
2992 /// Checks if the given \p TE is a gather node with clustered reused scalars 2993 /// and reorders it per given \p Mask. 2994void reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const;
2996 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 2997 /// if any. If it is not vectorized (gather node), returns nullptr. 2998 TreeEntry *getVectorizedOperand(TreeEntry *UserTE,
unsigned OpIdx) {
3000 TreeEntry *TE =
nullptr;
3002 TE = getTreeEntry(V);
3003if (TE &&
is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
3005auto It = MultiNodeScalars.find(V);
3006if (It != MultiNodeScalars.end()) {
3007for (TreeEntry *E : It->second) {
3008if (
is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
3016if (It != VL.
end()) {
3017assert(
TE->isSame(VL) &&
"Expected same scalars.");
3023 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph, 3024 /// if any. If it is not vectorized (gather node), returns nullptr. 3025const TreeEntry *getVectorizedOperand(
const TreeEntry *UserTE,
3026unsigned OpIdx)
const{
3027returnconst_cast<BoUpSLP *
>(
this)->getVectorizedOperand(
3028const_cast<TreeEntry *
>(UserTE), OpIdx);
3031 /// Checks if all users of \p I are the part of the vectorization tree. 3032bool areAllUsersVectorized(
3036 /// Return information about the vector formed for the specified index 3037 /// of a vector of (the same) instruction. 3040 /// \ returns the graph entry for the \p Idx operand of the \p E entry. 3041const TreeEntry *getOperandEntry(
const TreeEntry *E,
unsignedIdx)
const;
3043 /// Gets the root instruction for the given node. If the node is a strided 3044 /// load/store node with the reverse order, the root instruction is the last 3046Instruction *getRootEntryInstruction(
const TreeEntry &Entry)
const;
3048 /// \returns Cast context for the given graph node. 3050 getCastContextHint(
const TreeEntry &TE)
const;
3052 /// \returns the cost of the vectorizable entry. 3057 /// This is the recursive part of buildTree. 3059const EdgeInfo &EI,
unsigned InterleaveFactor = 0);
3061 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can 3062 /// be vectorized to use the original vector (or aggregate "bitcast" to a 3063 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise 3064 /// returns false, setting \p CurrentOrder to either an empty vector or a 3065 /// non-identity permutation that allows to reuse extract instructions. 3066 /// \param ResizeAllowed indicates whether it is allowed to handle subvector 3070bool ResizeAllowed =
false)
const;
3072 /// Vectorize a single entry in the tree. 3073 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3074 /// avoid issues with def-use order. 3077 /// Returns vectorized operand node, that matches the order of the scalars 3078 /// operand number \p NodeIdx in entry \p E. 3079 TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
unsigned NodeIdx);
3080const TreeEntry *getMatchedVectorizedOperand(
const TreeEntry *E,
3081unsigned NodeIdx)
const{
3082returnconst_cast<BoUpSLP *
>(
this)->getMatchedVectorizedOperand(E, NodeIdx);
3085 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry 3087 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to 3088 /// avoid issues with def-use order. 3089Value *vectorizeOperand(TreeEntry *E,
unsigned NodeIdx,
bool PostponedPHIs);
3091 /// Create a new vector from a list of scalar values. Produces a sequence 3092 /// which exploits values reused across lanes, and arranges the inserts 3093 /// for ease of later optimization. 3094template <
typename BVTy,
typename ResTy,
typename...
Args>
3095 ResTy processBuildVector(
const TreeEntry *E,
Type *ScalarTy, Args &...Params);
3097 /// Create a new vector from a list of scalar values. Produces a sequence 3098 /// which exploits values reused across lanes, and arranges the inserts 3099 /// for ease of later optimization. 3100Value *createBuildVector(
const TreeEntry *E,
Type *ScalarTy,
3103 /// Returns the instruction in the bundle, which can be used as a base point 3104 /// for scheduling. Usually it is the last instruction in the bundle, except 3105 /// for the case when all operands are external (in this case, it is the first 3106 /// instruction in the list). 3107Instruction &getLastInstructionInBundle(
const TreeEntry *E);
3109 /// Tries to find extractelement instructions with constant indices from fixed 3110 /// vector type and gather such instructions into a bunch, which highly likely 3111 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3112 /// was successful, the matched scalars are replaced by poison values in \p VL 3113 /// for future analysis. 3114 std::optional<TargetTransformInfo::ShuffleKind>
3118 /// Tries to find extractelement instructions with constant indices from fixed 3119 /// vector type and gather such instructions into a bunch, which highly likely 3120 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt 3121 /// was successful, the matched scalars are replaced by poison values in \p VL 3122 /// for future analysis. 3126unsigned NumParts)
const;
3128 /// Checks if the gathered \p VL can be represented as a single register 3129 /// shuffle(s) of previous tree entries. 3130 /// \param TE Tree entry checked for permutation. 3131 /// \param VL List of scalars (a subset of the TE scalar), checked for 3132 /// permutations. Must form single-register vector. 3133 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3134 /// commands to build the mask using the original vector value, without 3135 /// relying on the potential reordering. 3136 /// \returns ShuffleKind, if gathered values can be represented as shuffles of 3137 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. 3138 std::optional<TargetTransformInfo::ShuffleKind>
3139 isGatherShuffledSingleRegisterEntry(
3144 /// Checks if the gathered \p VL can be represented as multi-register 3145 /// shuffle(s) of previous tree entries. 3146 /// \param TE Tree entry checked for permutation. 3147 /// \param VL List of scalars (a subset of the TE scalar), checked for 3149 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also 3150 /// commands to build the mask using the original vector value, without 3151 /// relying on the potential reordering. 3152 /// \returns per-register series of ShuffleKind, if gathered values can be 3153 /// represented as shuffles of previous tree entries. \p Mask is filled with 3154 /// the shuffle mask (also on per-register base). 3156 isGatherShuffledEntry(
3159unsigned NumParts,
bool ForOrder =
false);
3161 /// \returns the cost of gathering (inserting) the values in \p VL into a 3163 /// \param ForPoisonSrc true if initial vector is poison, false otherwise. 3165Type *ScalarTy)
const;
3167 /// Set the Builder insert point to one after the last instruction in 3169void setInsertPointAfterBundle(
const TreeEntry *E);
3171 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not 3172 /// specified, the starting vector value is poison. 3177 /// \returns whether the VectorizableTree is fully vectorizable and will 3178 /// be beneficial even the tree height is tiny. 3179bool isFullyVectorizableTinyTree(
bool ForReduction)
const;
3181 /// Run through the list of all gathered loads in the graph and try to find 3182 /// vector loads/masked gathers instead of regular gathers. Later these loads 3183 /// are reshufled to build final gathered nodes. 3184void tryToVectorizeGatheredLoads(
3189 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the 3190 /// users of \p TE and collects the stores. It returns the map from the store 3191 /// pointers to the collected stores. 3193 collectUserStores(
const BoUpSLP::TreeEntry *TE)
const;
3195 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the 3196 /// stores in \p StoresVec can form a vector instruction. If so it returns 3197 /// true and populates \p ReorderIndices with the shuffle indices of the 3198 /// stores when compared to the sorted vector. 3202 /// Iterates through the users of \p TE, looking for scalar stores that can be 3203 /// potentially vectorized in a future SLP-tree. If found, it keeps track of 3204 /// their order and builds an order index vector for each store bundle. It 3205 /// returns all these order vectors found. 3206 /// We run this after the tree has formed, otherwise we may come across user 3207 /// instructions that are not yet in the tree. 3209 findExternalStoreUsersReorderIndices(TreeEntry *TE)
const;
3211 /// Tries to reorder the gathering node for better vectorization 3213void reorderGatherNode(TreeEntry &TE);
3217 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3219 /// \returns Common mask for reorder indices and reused scalars. 3227 /// \returns true if the scalars in VL are equal to this entry. 3234 [Scalars](
Value *V,
int Idx) {
3235 return (isa<UndefValue>(V) &&
3236 Idx == PoisonMaskElem) ||
3237 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3240if (!ReorderIndices.empty()) {
3241// TODO: implement matching if the nodes are just reordered, still can 3242// treat the vector as the same if the list of scalars matches VL 3243// directly, without reordering. 3247return IsSame(Scalars, Mask);
3248if (VL.
size() == ReuseShuffleIndices.size()) {
3250return IsSame(Scalars, Mask);
3254return IsSame(Scalars, ReuseShuffleIndices);
3257bool isOperandGatherNode(
const EdgeInfo &UserEI)
const{
3258returnisGather() && !UserTreeIndices.empty() &&
3259 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3260 UserTreeIndices.front().UserTE == UserEI.UserTE;
3263 /// \returns true if current entry has same operands as \p TE. 3264bool hasEqualOperands(
const TreeEntry &TE)
const{
3265if (
TE.getNumOperands() != getNumOperands())
3268for (
unsignedI = 0, E = getNumOperands();
I < E; ++
I) {
3269unsigned PrevCount =
Used.count();
3270for (
unsigned K = 0;
K < E; ++
K) {
3273if (getOperand(K) ==
TE.getOperand(
I)) {
3278// Check if we actually found the matching operand. 3279if (PrevCount ==
Used.count())
3285 /// \return Final vectorization factor for the node. Defined by the total 3286 /// number of vectorized scalars, including those, used several times in the 3287 /// entry and counted in the \a ReuseShuffleIndices, if any. 3288unsigned getVectorFactor()
 const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    /// Checks if the current node is a gather node.
    bool isGather() const { return State == NeedToGather; }
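    // For illustration: a node built from scalars {a, b} that is reused as
    // {a, b, a, b} carries ReuseShuffleIndices = {0, 1, 0, 1}, so
    // getVectorFactor() reports 4 even though Scalars.size() is 2.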
3297 /// A vector of scalars. 3300 /// The Scalars are vectorized into this value. It is initialized to Null. 3303 /// New vector phi instructions emitted for the vectorized phi nodes. 3306 /// Do we need to gather this sequence or vectorize it 3307 /// (either with vector instruction or with scatter/gather 3308 /// intrinsics for store/load)? 3310 Vectorize,
///< The node is regularly vectorized. 3311 ScatterVectorize,
///< Masked scatter/gather node. 3312 StridedVectorize,
///< Strided loads (and stores) 3313 NeedToGather,
///< Gather/buildvector node. 3314 CombinedVectorize,
///< Vectorized node, combined with its user into more 3315 ///< complex node like select/cmp to minmax, mul/add to 3316 ///< fma, etc. Must be used for the following nodes in 3317 ///< the pattern, not the very first one. 3321 /// List of combined opcodes supported by the vectorizer. 3322enum CombinedOpcode {
3324MinMax = Instruction::OtherOpsEnd + 1,
3326 CombinedOpcode CombinedOp = NotCombinedOp;
3328 /// Does this sequence require some shuffling? 3331 /// Does this entry require reordering? 3334 /// Points back to the VectorizableTree. 3336 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has 3337 /// to be a pointer and needs to be able to initialize the child iterator. 3338 /// Thus we need a reference back to the container to translate the indices 3340 VecTreeTy &Container;
3342 /// The TreeEntry index containing the user of this entry. We can actually 3343 /// have multiple users so the data structure is not truly a tree. 3346 /// The index of this treeEntry in VectorizableTree. 3349 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from 3350 /// other nodes as a series of insertvector instructions. 3354 /// The operands of each instruction in each lane Operands[op_index][lane]. 3355 /// Note: This helps avoid the replication of the code that performs the 3356 /// reordering of operands during buildTree_rec() and vectorizeTree(). 3359 /// MainOp and AltOp are recorded inside. S should be obtained from 3361 InstructionsState S = InstructionsState::invalid();
3363 /// Interleaving factor for interleaved loads Vectorize nodes. 3364unsigned InterleaveFactor = 0;
3367 /// Returns interleave factor for interleave nodes. 3368unsigned getInterleaveFactor()
const{
return InterleaveFactor; }
3369 /// Sets interleaving factor for the interleaving nodes. 3370void setInterleave(
unsigned Factor) { InterleaveFactor = Factor; }
3372 /// Set this bundle's \p OpIdx'th operand to \p OpVL. 3376assert(Operands[OpIdx].empty() &&
"Already resized?");
3378"Number of operands is greater than the number of scalars.");
3383 /// Set this bundle's operand from Scalars. 3384void setOperand(
constBoUpSLP &R,
bool RequireReorder =
false) {
3385 VLOperands Ops(Scalars, S, R);
3388for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands()))
3389 setOperand(
I, Ops.getVL(
I));
3392 /// Reorders operands of the node to the given mask \p Mask. 3398 /// \returns the \p OpIdx operand of this TreeEntry. 3404 /// \returns the \p OpIdx operand of this TreeEntry. 3410 /// \returns the number of operands. 3411unsigned getNumOperands()
const{
returnOperands.size(); }
3413 /// \return the single \p OpIdx operand. 3414Value *getSingleOperand(
unsigned OpIdx)
const{
3416assert(!Operands[OpIdx].empty() &&
"No operand available");
3420 /// Some of the instructions in the list have alternate opcodes. 3421bool isAltShuffle()
const{
return S.isAltShuffle(); }
3423bool isOpcodeOrAlt(
Instruction *
I)
const{
return S.isOpcodeOrAlt(
I); }
3425 /// Chooses the correct key for scheduling data. If \p Op has the same (or 3426 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is 3429auto *
I = dyn_cast<Instruction>(
Op);
3430if (
I && isOpcodeOrAlt(
I))
3432return S.getMainOp();
3435void setOperations(
const InstructionsState &S) {
3436assert(S &&
"InstructionsState is invalid.");
3440Instruction *getMainOp()
const{
return S.getMainOp(); }
3442Instruction *getAltOp()
const{
return S.getAltOp(); }
3444 /// The main/alternate opcodes for the list of instructions. 3445unsigned getOpcode()
const{
return S.
getOpcode(); }
3447unsigned getAltOpcode()
const{
return S.getAltOpcode(); }
3449bool hasState()
const{
return S.valid(); }
3451 /// When ReuseReorderShuffleIndices is empty it just returns position of \p 3452 /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 3453int findLaneForValue(
Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        if (*It != V)
          continue;
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())
          break;
        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
          break;
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");
      return FoundLane;
    }
const{
3487return IsNonPowerOf2;
3490 /// Return true if this is a node, which tries to vectorize number of 3491 /// elements, forming whole vectors. 3496assert((!IsNonPowerOf2 || ReuseShuffleIndices.
empty()) &&
3497"Reshuffling not supported with non-power-of-2 vectors yet.");
3498return IsNonPowerOf2;
3501Value *getOrdered(
unsigned Idx)
const{
3502assert(
isGather() &&
"Must be used only for buildvectors/gathers.");
3503if (ReorderIndices.
empty())
3514for (
unsigned OpI = 0, OpE =
Operands.size(); OpI != OpE; ++OpI) {
3515dbgs() <<
"Operand " << OpI <<
":\n";
3516for (
constValue *V : Operands[OpI])
3519dbgs() <<
"Scalars: \n";
3520for (
Value *V : Scalars)
3525if (InterleaveFactor > 0) {
3526dbgs() <<
"Vectorize with interleave factor " << InterleaveFactor
3529dbgs() <<
"Vectorize\n";
3532case ScatterVectorize:
3533dbgs() <<
"ScatterVectorize\n";
3535case StridedVectorize:
3536dbgs() <<
"StridedVectorize\n";
3539dbgs() <<
"NeedToGather\n";
3541case CombinedVectorize:
3542dbgs() <<
"CombinedVectorize\n";
3546dbgs() <<
"MainOp: " << *S.getMainOp() <<
"\n";
3547dbgs() <<
"AltOp: " << *S.getAltOp() <<
"\n";
3549dbgs() <<
"MainOp: NULL\n";
3550dbgs() <<
"AltOp: NULL\n";
3552dbgs() <<
"VectorizedValue: ";
3554dbgs() << *VectorizedValue <<
"\n";
3557dbgs() <<
"ReuseShuffleIndices: ";
3558if (ReuseShuffleIndices.
empty())
3561for (
int ReuseIdx : ReuseShuffleIndices)
3562dbgs() << ReuseIdx <<
", ";
3564dbgs() <<
"ReorderIndices: ";
3565for (
unsigned ReorderIdx : ReorderIndices)
3566dbgs() << ReorderIdx <<
", ";
3568dbgs() <<
"UserTreeIndices: ";
3569for (
constauto &EInfo : UserTreeIndices)
3570dbgs() << EInfo <<
", ";
3572if (!CombinedEntriesWithIndices.
empty()) {
3573dbgs() <<
"Combined entries: ";
3575dbgs() <<
"Entry index " <<
P.first <<
" with offset " <<
P.second;
3584void dumpTreeCosts(
const TreeEntry *E,
InstructionCost ReuseShuffleCost,
3587dbgs() <<
"SLP: " << Banner <<
":\n";
3589dbgs() <<
"SLP: Costs:\n";
3590dbgs() <<
"SLP: ReuseShuffleCost = " << ReuseShuffleCost <<
"\n";
3591dbgs() <<
"SLP: VectorCost = " << VecCost <<
"\n";
3592dbgs() <<
"SLP: ScalarCost = " << ScalarCost <<
"\n";
3593dbgs() <<
"SLP: ReuseShuffleCost + VecCost - ScalarCost = " 3594 << ReuseShuffleCost + VecCost - ScalarCost <<
"\n";
3598 /// Create a new VectorizableTree entry. 3600 std::optional<ScheduleData *> Bundle,
3601const InstructionsState &S,
3602const EdgeInfo &UserTreeIdx,
3605unsigned InterleaveFactor = 0) {
3606 TreeEntry::EntryState EntryState =
3607 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3608 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3609 ReuseShuffleIndices, ReorderIndices);
3610if (E && InterleaveFactor > 0)
3611 E->setInterleave(InterleaveFactor);
3616 TreeEntry::EntryState EntryState,
3617 std::optional<ScheduleData *> Bundle,
3618const InstructionsState &S,
3619const EdgeInfo &UserTreeIdx,
3622assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3623 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3624"Need to vectorize gather entry?");
3625// Gathered loads still gathered? Do not create entry, use the original one. 3626if (GatheredLoadsEntriesFirst.has_value() &&
3627 EntryState == TreeEntry::NeedToGather && S &&
3628 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3629 !UserTreeIdx.UserTE)
3631 VectorizableTree.
push_back(std::make_unique<TreeEntry>(VectorizableTree));
3632 TreeEntry *
Last = VectorizableTree.
back().get();
3633Last->Idx = VectorizableTree.
size() - 1;
3634Last->State = EntryState;
3635// FIXME: Remove once support for ReuseShuffleIndices has been implemented 3636// for non-power-of-two vectors. 3639 ReuseShuffleIndices.empty()) &&
3640"Reshuffling scalars not yet supported for nodes with padding");
3641Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3642 ReuseShuffleIndices.end());
3643if (ReorderIndices.
empty()) {
3646Last->setOperations(S);
3648// Reorder scalars and build final mask. 3649Last->Scalars.assign(VL.
size(),
nullptr);
3652 if (Idx >= VL.size())
3653 return UndefValue::get(VL.front()->getType());
3658Last->setOperations(S);
3659Last->ReorderIndices.append(ReorderIndices.
begin(), ReorderIndices.
end());
3661if (!
Last->isGather()) {
3662for (
Value *V : VL) {
3663if (isa<PoisonValue>(V))
3665const TreeEntry *
TE = getTreeEntry(V);
3667"Scalar already in tree!");
3670 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(
Last);
3673 ScalarToTreeEntry[
V] =
Last;
3675// Update the scheduler bundle to point to this TreeEntry. 3676 ScheduleData *BundleMember = *Bundle;
3677assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3680"Bundle and VL out of sync");
3682for (
Value *V : VL) {
3687 BundleMember->TE =
Last;
3688 BundleMember = BundleMember->NextInBundle;
3691assert(!BundleMember &&
"Bundle and VL out of sync");
3693// Build a map for gathered scalars to the nodes where they are used. 3694bool AllConstsOrCasts =
true;
3697auto *
I = dyn_cast<CastInst>(V);
3698 AllConstsOrCasts &=
I &&
I->getType()->isIntegerTy();
3699if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3700 !UserTreeIdx.UserTE->isGather())
3703if (AllConstsOrCasts)
3705 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3706 MustGather.
insert(VL.begin(), VL.end());
3709if (UserTreeIdx.UserTE)
3710Last->UserTreeIndices.push_back(UserTreeIdx);
3714 /// -- Vectorization State -- 3715 /// Holds all of the tree entries. 3716 TreeEntry::VecTreeTy VectorizableTree;
3721for (
unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3722 VectorizableTree[
Id]->dump();
  TreeEntry *getTreeEntry(Value *V) {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }

  const TreeEntry *getTreeEntry(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }
3738 /// Check that the operand node of alternate node does not generate 3739 /// buildvector sequence. If it is, then probably not worth it to build 3740 /// alternate shuffle, if number of buildvector operands + alternate 3741 /// instruction > than the number of buildvector instructions. 3742 /// \param S the instructions state of the analyzed values. 3743 /// \param VL list of the instructions with alternate opcodes. 3744bool areAltOperandsProfitable(
const InstructionsState &S,
3747 /// Checks if the specified list of the instructions/values can be vectorized 3748 /// and fills required data before actual scheduling of the instructions. 3749 TreeEntry::EntryState
3751bool IsScatterVectorizeUserTE,
3755 /// Maps a specific scalar to its tree entry. 3758 /// List of scalars, used in several vectorize nodes, and the list of the 3762 /// Maps a value to the proposed vectorizable size. 3765 /// A list of scalars that we found that we need to keep as scalars. 3768 /// A set of first non-schedulable values. 3771 /// A map between the vectorized entries and the last instructions in the 3772 /// bundles. The bundles are built in use order, not in the def order of the 3773 /// instructions. So, we cannot rely directly on the last instruction in the 3774 /// bundle being the last instruction in the program order during 3775 /// vectorization process since the basic blocks are affected, need to 3776 /// pre-gather them before. 3779 /// List of gather nodes, depending on other gather/vector nodes, which should 3780 /// be emitted after the vector instruction emission process to correctly 3781 /// handle order of the vector instructions and shuffles. 3784usingValueToGatherNodesMap =
3786 ValueToGatherNodesMap ValueToGatherNodes;
3788 /// A list of the load entries (node indices), which can be vectorized using 3789 /// strided or masked gather approach, but attempted to be represented as 3790 /// contiguous loads. 3793 /// true if graph nodes transforming mode is on. 3794bool IsGraphTransformMode =
false;
3796 /// The index of the first gathered load entry in the VectorizeTree. 3797 std::optional<unsigned> GatheredLoadsEntriesFirst;
3799 /// This POD struct describes one external user in the vectorized tree. 3804// Which scalar in our function. 3807// Which user that uses the scalar. 3810// Which lane does the scalar belong to. 3815 /// Checks if two instructions may access the same memory. 3817 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it 3818 /// is invariant in the calling loop. 3823// First check if the result is already in the cache. 3824 AliasCacheKey
Key = std::make_pair(Inst1, Inst2);
3825auto It = AliasCache.
find(Key);
3826if (It != AliasCache.
end())
3829// Store the result in the cache. 3831 AliasCache.
try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3835usingAliasCacheKey = std::pair<Instruction *, Instruction *>;
3837 /// Cache for alias results. 3838 /// TODO: consider moving this to the AliasAnalysis itself. 3841// Cache for pointerMayBeCaptured calls inside AA. This is preserved 3842// globally through SLP because we don't perform any action which 3843// invalidates capture results. 3846 /// Temporary store for deleted instructions. Instructions will be deleted 3847 /// eventually when the BoUpSLP is destructed. The deferral is required to 3848 /// ensure that there are no incorrect collisions in the AliasCache, which 3849 /// can happen if a new instruction is allocated at the same address as a 3850 /// previously deleted instruction. 3853 /// Set of the instruction, being analyzed already for reductions. 3856 /// Set of hashes for the list of reduction values already being analyzed. 3859 /// Values, already been analyzed for mininmal bitwidth and found to be 3863 /// A list of values that need to extracted out of the tree. 3864 /// This list holds pairs of (Internal Scalar : External User). External User 3865 /// can be nullptr, it means that this Internal Scalar will be used later, 3866 /// after vectorization. 3867 UserList ExternalUses;
3869 /// A list of GEPs which can be reaplced by scalar GEPs instead of 3870 /// extractelement instructions. 3873 /// Values used only by @llvm.assume calls. 3876 /// Holds all of the instructions that we gathered, shuffle instructions and 3877 /// extractelements. 3880 /// A list of blocks that we are going to CSE. 3883 /// List of hashes of vector of loads, which are known to be non vectorizable. 3886 /// Contains all scheduling relevant data for an instruction. 3887 /// A ScheduleData either represents a single instruction or a member of an 3888 /// instruction bundle (= a group of instructions which is combined into a 3889 /// vector instruction). 3891// The initial value for the dependency counters. It means that the 3892// dependencies are not calculated yet. 3893enum { InvalidDeps = -1 };
3895 ScheduleData() =
default;
3898 FirstInBundle =
this;
3899 NextInBundle =
nullptr;
3900 NextLoadStore =
nullptr;
3902 SchedulingRegionID = BlockSchedulingRegionID;
3903 clearDependencies();
3908 /// Verify basic self consistency properties 3910if (hasValidDependencies()) {
3911assert(UnscheduledDeps <= Dependencies &&
"invariant");
3913assert(UnscheduledDeps == Dependencies &&
"invariant");
3917assert(isSchedulingEntity() &&
3918"unexpected scheduled state");
3919for (
const ScheduleData *BundleMember =
this; BundleMember;
3920 BundleMember = BundleMember->NextInBundle) {
3921assert(BundleMember->hasValidDependencies() &&
3922 BundleMember->UnscheduledDeps == 0 &&
3923"unexpected scheduled state");
3924assert((BundleMember ==
this || !BundleMember->IsScheduled) &&
3925"only bundle is marked scheduled");
3929assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3930"all bundle members must be in same basic block");
3933 /// Returns true if the dependency information has been calculated. 3934 /// Note that depenendency validity can vary between instructions within 3935 /// a single bundle. 3936bool hasValidDependencies()
const{
return Dependencies != InvalidDeps; }
3938 /// Returns true for single instructions and for bundle representatives 3939 /// (= the head of a bundle). 3940bool isSchedulingEntity()
const{
return FirstInBundle ==
this; }
3942 /// Returns true if it represents an instruction bundle and not only a 3943 /// single instruction. 3944bool isPartOfBundle()
const{
3945return NextInBundle !=
nullptr || FirstInBundle !=
this ||
TE;
3948 /// Returns true if it is ready for scheduling, i.e. it has no more 3949 /// unscheduled depending instructions/bundles. 3950bool isReady()
const{
3951assert(isSchedulingEntity() &&
3952"can't consider non-scheduling entity for ready list");
3953return unscheduledDepsInBundle() == 0 && !IsScheduled;
3956 /// Modifies the number of unscheduled dependencies for this instruction, 3957 /// and returns the number of remaining dependencies for the containing 3959int incrementUnscheduledDeps(
int Incr) {
3960assert(hasValidDependencies() &&
3961"increment of unscheduled deps would be meaningless");
3962 UnscheduledDeps += Incr;
3963return FirstInBundle->unscheduledDepsInBundle();
3966 /// Sets the number of unscheduled dependencies to the number of 3968void resetUnscheduledDeps() {
3969 UnscheduledDeps = Dependencies;
3972 /// Clears all dependency information. 3973void clearDependencies() {
3974 Dependencies = InvalidDeps;
3975 resetUnscheduledDeps();
3976 MemoryDependencies.clear();
3977 ControlDependencies.clear();
3980int unscheduledDepsInBundle()
const{
3981assert(isSchedulingEntity() &&
"only meaningful on the bundle");
3983for (
const ScheduleData *BundleMember =
this; BundleMember;
3984 BundleMember = BundleMember->NextInBundle) {
3985if (BundleMember->UnscheduledDeps == InvalidDeps)
3987 Sum += BundleMember->UnscheduledDeps;
3993if (!isSchedulingEntity()) {
3995 }
elseif (NextInBundle) {
3997 ScheduleData *SD = NextInBundle;
3999 os <<
';' << *SD->Inst;
4000 SD = SD->NextInBundle;
4012 /// The TreeEntry that this instruction corresponds to. 4013 TreeEntry *
TE =
nullptr;
4015 /// Points to the head in an instruction bundle (and always to this for 4016 /// single instructions). 4017 ScheduleData *FirstInBundle =
nullptr;
4019 /// Single linked list of all instructions in a bundle. Null if it is a 4020 /// single instruction. 4021 ScheduleData *NextInBundle =
nullptr;
4023 /// Single linked list of all memory instructions (e.g. load, store, call) 4024 /// in the block - until the end of the scheduling region. 4025 ScheduleData *NextLoadStore =
nullptr;
4027 /// The dependent memory instructions. 4028 /// This list is derived on demand in calculateDependencies(). 4031 /// List of instructions which this instruction could be control dependent 4032 /// on. Allowing such nodes to be scheduled below this one could introduce 4033 /// a runtime fault which didn't exist in the original program. 4034 /// ex: this is a load or udiv following a readonly call which inf loops 4037 /// This ScheduleData is in the current scheduling region if this matches 4038 /// the current SchedulingRegionID of BlockScheduling. 4039int SchedulingRegionID = 0;
4041 /// Used for getting a "good" final ordering of instructions. 4042int SchedulingPriority = 0;
4044 /// The number of dependencies. Constitutes of the number of users of the 4045 /// instruction plus the number of dependent memory instructions (if any). 4046 /// This value is calculated on demand. 4047 /// If InvalidDeps, the number of dependencies is not calculated yet. 4048int Dependencies = InvalidDeps;
4050 /// The number of dependencies minus the number of dependencies of scheduled 4051 /// instructions. As soon as this is zero, the instruction/bundle gets ready 4053 /// Note that this is negative as long as Dependencies is not calculated. 4054int UnscheduledDeps = InvalidDeps;
4056 /// True if this instruction is scheduled (or considered as scheduled in the 4058bool IsScheduled =
false;
4063const BoUpSLP::ScheduleData &SD) {
4072 /// Contains all scheduling data for a basic block. 4073 /// It does not schedules instructions, which are not memory read/write 4074 /// instructions and their operands are either constants, or arguments, or 4075 /// phis, or instructions from others blocks, or their users are phis or from 4076 /// the other blocks. The resulting vector instructions can be placed at the 4077 /// beginning of the basic block without scheduling (if operands does not need 4078 /// to be scheduled) or at the end of the block (if users are outside of the 4079 /// block). It allows to save some compile time and memory used by the 4081 /// ScheduleData is assigned for each instruction in between the boundaries of 4082 /// the tree entry, even for those, which are not part of the graph. It is 4083 /// required to correctly follow the dependencies between the instructions and 4084 /// their correct scheduling. The ScheduleData is not allocated for the 4085 /// instructions, which do not require scheduling, like phis, nodes with 4086 /// extractelements/insertelements only or nodes with instructions, with 4087 /// uses/operands outside of the block. 4088structBlockScheduling {
4090 : BB(BB), ChunkSize(BB->
size()), ChunkPos(ChunkSize) {}
4094 ScheduleStart =
nullptr;
4095 ScheduleEnd =
nullptr;
4096 FirstLoadStoreInRegion =
nullptr;
4097 LastLoadStoreInRegion =
nullptr;
4098 RegionHasStackSave =
false;
4100// Reduce the maximum schedule region size by the size of the 4101// previous scheduling run. 4102 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4105 ScheduleRegionSize = 0;
4107// Make a new scheduling region, i.e. all existing ScheduleData is not 4108// in the new region yet. 4109 ++SchedulingRegionID;
    ScheduleData *getScheduleData(Instruction *I) {
      if (BB != I->getParent())
        // Avoid lookup if can't possibly be in map.
        return nullptr;
      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))
        return SD;
      return nullptr;
    }

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
      return nullptr;
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    /// Marks an instruction as scheduled and puts all dependent ready
    /// instructions into the ready-list.
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        // Handle the def-use chain dependencies.

        // Decrement the unscheduled counter and insert to ready list if ready.
        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after
            // decrementing, so we can put the dependent instruction
            // into the ready list.
            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        // If BundleMember is a vector bundle, its operands may have been
        // reordered during buildTree(). We therefore need to get its operands
        // through the TreeEntry.
        if (TreeEntry *TE = BundleMember->TE) {
          // Need to search for the lane since the tree entry can be reordered.
          auto *In = BundleMember->Inst;
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, In));
          assert(Lane >= 0 && "Lane not set");

          // Since the vectorization tree is being built recursively, this
          // assertion ensures that the tree entry has all operands set before
          // reaching this code. Couple of exceptions known at the moment are
          // extracts where their second (immediate) operand is not added.
          // Since immediates do not affect scheduler behavior this is
          // considered okay.
          assert(In &&
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx : seq<unsigned>(TE->getNumOperands()))
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          // If BundleMember is a stand-alone instruction, no operand
          // reordering has taken place, so we directly access its operands.
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }

        // Handle the memory dependencies.
        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }

        // Handle the control dependencies.
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {
            // There are no more unscheduled dependencies after decrementing,
            // so we can put the dependent instruction into the ready list.
            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }
    /// Verify basic self consistency properties of the data structure.
    void verify() {
      if (!ScheduleStart)
        return;

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);
        if (!SD)
          continue;
        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
        (void)SD;
      }
    }
    /// Put all instructions into the ReadyList which are ready for scheduling.
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
            SD->isReady()) {
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
        }
      }
    }
    /// Build a bundle from the ScheduleData nodes corresponding to the
    /// scalar instruction for each lane.
    ScheduleData *buildBundle(ArrayRef<Value *> VL);

    /// Checks if a bundle of instructions can be scheduled, i.e. has no
    /// cyclic dependencies. This is only a dry-run, no instructions are
    /// actually moved at this stage.
    /// \returns the scheduling bundle. The returned Optional value is not
    /// std::nullopt if \p VL is allowed to be scheduled.
    std::optional<ScheduleData *>
    tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                      const InstructionsState &S);

    /// Un-bundles a group of instructions.
    void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);

    /// Allocates schedule data chunk.
    ScheduleData *allocateScheduleDataChunks();

    /// Extends the scheduling region so that V is inside the region.
    /// \returns true if the region size is within the limit.
    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

    /// Initialize the ScheduleData structures for new instructions in the
    /// scheduling region.
    void initScheduleData(Instruction *FromI, Instruction *ToI,
                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    /// Updates the dependency information of a bundle and of all instructions/
    /// bundles which depend on the original bundle.
    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
                               BoUpSLP *SLP);

    /// Sets all instructions in the scheduling region to un-scheduled.
    void resetSchedule();
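    // Rough sketch of how these hooks cooperate during the dry run (see the
    // doc comments above; the exact call sites live in tryScheduleBundle()
    // and its callers): the region is grown with extendSchedulingRegion()
    // until it covers all bundle members, ScheduleData is prepared via
    // initScheduleData(), dependencies are computed with
    // calculateDependencies(), and on failure the bundle is taken apart
    // again with cancelScheduling().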
    /// Simple memory allocation for ScheduleData.
    std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;

    /// The size of a ScheduleData array in ScheduleDataChunks.
    int ChunkSize;

    /// The allocator position in the current chunk, which is the last entry
    /// of ScheduleDataChunks.
    int ChunkPos;

    /// Attaches ScheduleData to Instruction.
    /// Note that the mapping survives during all vectorization iterations, i.e.
    /// ScheduleData structures are recycled.
    DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;

    /// The ready-list for scheduling (only used for the dry-run).
    SetVector<ScheduleData *> ReadyInsts;

    /// The first instruction of the scheduling region.
    Instruction *ScheduleStart = nullptr;

    /// The first instruction _after_ the scheduling region.
    Instruction *ScheduleEnd = nullptr;

    /// The first memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *FirstLoadStoreInRegion = nullptr;

    /// The last memory accessing instruction in the scheduling region
    /// (can be null).
    ScheduleData *LastLoadStoreInRegion = nullptr;

    /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
    /// region? Used to optimize the dependence calculation for the
    /// common case where there isn't.
    bool RegionHasStackSave = false;

    /// The current size of the scheduling region.
    int ScheduleRegionSize = 0;

    /// The maximum size allowed for the scheduling region.
    int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;

    /// The ID of the scheduling region. For a new vectorization iteration this
    /// is incremented which "removes" all ScheduleData from the region.
    /// Make sure that the initial SchedulingRegionID is greater than the
    /// initial SchedulingRegionID in ScheduleData (which is 0).
    int SchedulingRegionID = 1;
  };
  /// Attaches the BlockScheduling structures to basic blocks.
  MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;

  /// Performs the "real" scheduling. Done before vectorization is actually
  /// performed in a basic block.
  void scheduleBlock(BlockScheduling *BS);

  /// List of users to ignore during scheduling and that don't need extracting.
  const SmallDenseSet<Value *> *UserIgnoreList = nullptr;

  /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
  /// sorted SmallVectors of unsigned.
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {
      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
    }
  };

  // Analysis and block reference.
  unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
  unsigned MinVecRegSize; // Set by cl::opt (default: 128).

  /// Instruction builder to construct the vectorized tree.
  IRBuilder<TargetFolder> Builder;

  /// A map of scalar integer values to the smallest bit width with which they
  /// can legally be represented. The values map to (width, signed) pairs,
  /// where "width" indicates the minimum bit width and "signed" is True if the
  /// value must be signed-extended, rather than zero-extended, back to its
  /// original width.
  DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;

  /// Final size of the reduced vector, if the current graph represents the
  /// input for the reduction and it was possible to narrow the size of the
  /// reduction.
  unsigned ReductionBitWidth = 0;

  /// Canonical graph size before the transformations.
  unsigned BaseGraphSize = 1;

  /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
  /// type sizes, used in the tree.
  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  /// Indices of the vectorized nodes, which are supposed to be the roots of
  /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
  DenseSet<unsigned> ExtraBitWidthNodes;
};
} // end namespace slpvectorizer

template <> struct GraphTraits<BoUpSLP *> {
  using TreeEntry = BoUpSLP::TreeEntry;

  /// NodeRef has to be a pointer per the GraphWriter.
  using NodeRef = TreeEntry *;

  using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;

  /// Add the VectorizableTree to the index iterator to be able to return
  /// TreeEntry pointers.
  struct ChildIteratorType
      : public iterator_adaptor_base<
            ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
    ContainerTy &VectorizableTree;

    ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
                      ContainerTy &VT)
        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}

    NodeRef operator*() { return I->UserTE; }
  };

  static NodeRef getEntryNode(BoUpSLP &R) {
    return R.VectorizableTree[0].get();
  }

  static ChildIteratorType child_begin(NodeRef N) {
    return {N->UserTreeIndices.begin(), N->Container};
  }

  static ChildIteratorType child_end(NodeRef N) {
    return {N->UserTreeIndices.end(), N->Container};
  }

  /// For the node iterator we just need to turn the TreeEntry iterator into a
  /// TreeEntry* iterator so that it dereferences to NodeRef.
  class nodes_iterator {
    using ItTy = ContainerTy::iterator;
    ItTy It;

  public:
    nodes_iterator(const ItTy &It2) : It(It2) {}
    NodeRef operator*() { return It->get(); }
    nodes_iterator operator++() {
      ++It;
      return *this;
    }
    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
  };

  static nodes_iterator nodes_begin(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.begin());
  }

  static nodes_iterator nodes_end(BoUpSLP *R) {
    return nodes_iterator(R->VectorizableTree.end());
  }
};
template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
  using TreeEntry = BoUpSLP::TreeEntry;

  DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}

  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << Entry->Idx << ".\n";
    for (auto *V : Entry->Scalars) {
      OS << *V;
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
        OS << " <extract>";
      OS << "\n";
    }
    return Str;
  }

  static std::string getNodeAttributes(const TreeEntry *Entry,
                                       const BoUpSLP *) {
    if (Entry->isGather())
      return "color=red";
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
    return "";
  }
};

} // end namespace llvm
BoUpSLP::~BoUpSLP() {
  SmallVector<WeakTrackingVH> DeadInsts;
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {
      // Temporarily insert instruction back to erase them from parent and
      // memory later.
      if (isa<PHINode>(I))
        // Phi nodes must be the very first instructions in the block.
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator()->getIterator());
      continue;
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
          wouldInstructionBeTriviallyDead(Op, TLI))
        DeadInsts.emplace_back(Op);
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
    assert(I->use_empty() &&
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }

  // Cleanup any dead scalar code feeding the vectorized instructions.
  RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);

#ifdef EXPENSIVE_CHECKS
  // If we could guarantee that this call is not extremely slow, we could
  // remove the ifdef limitation (see PR47712).
  assert(!verifyFunction(*F, &dbgs()));
#endif
}

/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
/// contains the original mask for the scalars reused in the node. The
/// procedure transforms this mask in accordance with the given \p Mask.
static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
  assert(!Mask.empty() && Reuses.size() == Mask.size() &&
         "Expected non-empty mask.");
  SmallVector<int> Prev(Reuses.begin(), Reuses.end());
  Prev.swap(Reuses);
  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
    if (Mask[I] != PoisonMaskElem)
      Reuses[Mask[I]] = Prev[I];
}
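// Illustrative example (values chosen for exposition only): with
// Reuses = {a, b, c, d} and Mask = {2, 0, 3, 1}, the element previously stored
// at position I is republished at position Mask[I], giving
// Reuses = {b, d, a, c}.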
/// Reorders the given \p Order according to the given \p Mask. \p Order is
/// the original order of the scalars. The procedure transforms the provided
/// order in accordance with the given \p Mask. If the resulting \p Order is
/// just an identity order, \p Order is cleared.
static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
                         bool BottomOrder = false) {
  assert(!Mask.empty() && "Expected non-empty mask.");
  unsigned Sz = Mask.size();
  if (BottomOrder) {
    SmallVector<unsigned> PrevOrder;
    if (Order.empty()) {
      PrevOrder.resize(Sz);
      std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
    } else {
      PrevOrder.swap(Order);
    }
    Order.assign(Sz, Sz);
    for (unsigned I = 0; I < Sz; ++I)
      if (Mask[I] != PoisonMaskElem)
        Order[I] = PrevOrder[Mask[I]];
    if (all_of(enumerate(Order), [&](const auto &Data) {
          return Data.value() == Sz || Data.index() == Data.value();
        })) {
      Order.clear();
      return;
    }
    fixupOrderingIndices(Order);
    return;
  }
  SmallVector<int> MaskOrder;
  if (Order.empty()) {
    MaskOrder.resize(Sz);
    std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
  } else {
    inversePermutation(Order, MaskOrder);
  }
  reorderReuses(MaskOrder, Mask);
  if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
    Order.clear();
    return;
  }
  Order.assign(Sz, Sz);
  for (unsigned I = 0; I < Sz; ++I)
    if (MaskOrder[I] != PoisonMaskElem)
      Order[MaskOrder[I]] = I;
  fixupOrderingIndices(Order);
}
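// Illustrative example (values for exposition only): with BottomOrder == true,
// Order = {1, 0, 3, 2} and Mask = {2, 3, 0, 1}, each new element is taken from
// PrevOrder[Mask[I]], so the result is Order = {3, 2, 1, 0}. Had the
// combination produced the identity {0, 1, 2, 3}, the order would have been
// cleared, since an identity order carries no extra information.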
std::optional<BoUpSLP::OrdersType>
BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
  assert(TE.isGather() && "Expected gather node only.");
  // Try to find subvector extract/insert patterns and reorder only such
  // patterns.
  SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
  Type *ScalarTy = GatheredScalars.front()->getType();
  int NumScalars = GatheredScalars.size();
  if (!isValidElementType(ScalarTy))
    return std::nullopt;
  auto *VecTy = getWidenedType(ScalarTy, NumScalars);
  int NumParts = TTI->getNumberOfParts(VecTy);
  if (NumParts == 0 || NumParts >= NumScalars)
    NumParts = 1;
  SmallVector<int> ExtractMask;
  SmallVector<int> Mask;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
      tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
      isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
                            /*ForOrder=*/true);
  // No shuffled operands - ignore.
  if (GatherShuffles.empty() && ExtractShuffles.empty())
    return std::nullopt;
  OrdersType CurrentOrder(NumScalars, NumScalars);
  if (GatherShuffles.size() == 1 &&
      *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
      Entries.front().front()->isSame(TE.Scalars)) {
    // Perfect match in the graph, will reuse the previously vectorized
    // node. Cost is 0.
    std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
    return CurrentOrder;
  }
  auto IsSplatMask = [](ArrayRef<int> Mask) {
    int SingleElt = PoisonMaskElem;
    return all_of(Mask, [&](int I) {
      if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
        SingleElt = I;
      return I == PoisonMaskElem || I == SingleElt;
    });
  };
  // Exclusive broadcast mask - ignore.
  if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
       (Entries.size() != 1 ||
        Entries.front().front()->ReorderIndices.empty())) ||
      (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
    return std::nullopt;
  SmallBitVector ShuffledSubMasks(NumParts);
  auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
                                  ArrayRef<int> Mask, int PartSz, int NumParts,
                                  function_ref<unsigned(unsigned)> GetVF) {
    for (int I : seq<int>(0, NumParts)) {
      if (ShuffledSubMasks.test(I))
        continue;
      const int VF = GetVF(I);
      if (VF == 0)
        continue;
      unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
      MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
      // Shuffle of at least 2 vectors - ignore.
      if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      // Try to include as much elements from the mask as possible.
      int FirstMin = INT_MAX;
      int SecondVecFound = false;
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem) {
          Value *V = GatheredScalars[I * PartSz + K];
          if (isConstant(V) && !isa<PoisonValue>(V)) {
            SecondVecFound = true;
            break;
          }
          continue;
        }
        if (Idx < VF) {
          if (FirstMin > Idx)
            FirstMin = Idx;
        } else {
          SecondVecFound = true;
          break;
        }
      }
      FirstMin = (FirstMin / PartSz) * PartSz;
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
      for (int K : seq<int>(Limit)) {
        int Idx = Mask[I * PartSz + K];
        if (Idx == PoisonMaskElem)
          continue;
        Idx -= FirstMin;
        if (Idx >= PartSz) {
          SecondVecFound = true;
          break;
        }
        if (CurrentOrder[I * PartSz + Idx] >
                static_cast<unsigned>(I * PartSz + K) &&
            CurrentOrder[I * PartSz + Idx] !=
                static_cast<unsigned>(I * PartSz + Idx))
          CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
      }
      // Shuffle of at least 2 vectors - ignore.
      if (SecondVecFound) {
        std::fill(Slice.begin(), Slice.end(), NumScalars);
        ShuffledSubMasks.set(I);
        continue;
      }
    }
  };
  int PartSz = getPartNumElems(NumScalars, NumParts);
  if (!ExtractShuffles.empty())
    TransformMaskToOrder(
        CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
          if (!ExtractShuffles[I])
            return 0U;
          unsigned VF = 0;
          unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
          for (unsigned Idx : seq<unsigned>(Sz)) {
            int K = I * PartSz + Idx;
            if (ExtractMask[K] == PoisonMaskElem)
              continue;
            if (!TE.ReuseShuffleIndices.empty())
              K = TE.ReuseShuffleIndices[K];
            if (K == PoisonMaskElem)
              continue;
            if (!TE.ReorderIndices.empty())
              K = std::distance(TE.ReorderIndices.begin(),
                                find(TE.ReorderIndices, K));
            auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
            if (!EI)
              continue;
            VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
                                  ->getElementCount()
                                  .getKnownMinValue());
          }
          return VF;
        });
  // Check special corner case - single shuffle of the same entry.
  if (GatherShuffles.size() == 1 && NumParts != 1) {
    if (ShuffledSubMasks.any())
      return std::nullopt;
    PartSz = NumScalars;
    NumParts = 1;
  }
  if (!Entries.empty())
    TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
      if (!GatherShuffles[I])
        return 0U;
      return std::max(Entries[I].front()->getVectorFactor(),
                      Entries[I].back()->getVectorFactor());
    });
  int NumUndefs =
      count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
  if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
    return std::nullopt;
  return std::move(CurrentOrder);
}
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
                                  const TargetLibraryInfo &TLI,
                                  bool CompareOpcodes = true) {
  auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
  auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
  return (!GEP1 || GEP1->getNumOperands() == 2) &&
         (!GEP2 || GEP2->getNumOperands() == 2) &&
         (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
           (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
          !CompareOpcodes ||
          (GEP1 && GEP2 &&
           getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
}

/// Calculates minimal alignment as a common alignment.
template <typename T>
static Align computeCommonAlignment(ArrayRef<Value *> VL) {
  Align CommonAlignment = cast<T>(VL.front())->getAlign();
  for (Value *V : VL.drop_front())
    CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
  return CommonAlignment;
}

/// Check if \p Order represents reverse order.
static bool isReverseOrder(ArrayRef<unsigned> Order) {
  assert(!Order.empty() &&
         "Order is empty. Please check it before using isReverseOrder.");
  unsigned Sz = Order.size();
  return all_of(enumerate(Order), [&](const auto &Pair) {
    return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
  });
}

/// Checks if the provided list of pointers \p Pointers represents the strided
/// pointers for type ElemTy. If they are not, std::nullopt is returned.
/// Otherwise, if \p Inst is not specified, a just-initialized optional value
/// is returned to show that the pointers represent strided pointers. If \p
/// Inst is specified, the runtime stride is materialized before the given \p
/// Inst.
/// \returns std::nullopt if the pointers are not pointers with the runtime
/// stride, nullptr or the actual stride value, otherwise.
static std::optional<Value *>
calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
                  const DataLayout &DL, ScalarEvolution &SE,
                  SmallVectorImpl<unsigned> &SortedIndices,
                  Instruction *Inst = nullptr) {
  SmallVector<const SCEV *> SCEVs;
  const SCEV *PtrSCEVLowest = nullptr;
  const SCEV *PtrSCEVHighest = nullptr;
  // Find lower/upper pointers from the PointerOps (i.e. with lowest and
  // highest addresses).
  for (Value *Ptr : PointerOps) {
    const SCEV *PtrSCEV = SE.getSCEV(Ptr);
    if (!PtrSCEV)
      return std::nullopt;
    SCEVs.push_back(PtrSCEV);
    if (!PtrSCEVLowest && !PtrSCEVHighest) {
      PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
      continue;
    }
    const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
    if (isa<SCEVCouldNotCompute>(Diff))
      return std::nullopt;
    if (Diff->isNonConstantNegative()) {
      PtrSCEVLowest = PtrSCEV;
      continue;
    }
    const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
    if (isa<SCEVCouldNotCompute>(Diff1))
      return std::nullopt;
    if (Diff1->isNonConstantNegative()) {
      PtrSCEVHighest = PtrSCEV;
      continue;
    }
  }
  // Dist = PtrSCEVHighest - PtrSCEVLowest;
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
  if (isa<SCEVCouldNotCompute>(Dist))
    return std::nullopt;
  int Size = DL.getTypeStoreSize(ElemTy);
  auto TryGetStride = [&](const SCEV *Dist,
                          const SCEV *Multiplier) -> const SCEV * {
    if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
      if (M->getOperand(0) == Multiplier)
        return M->getOperand(1);
      if (M->getOperand(1) == Multiplier)
        return M->getOperand(0);
      return nullptr;
    }
    if (Multiplier == Dist)
      return SE.getConstant(Dist->getType(), 1);
    return nullptr;
  };
  // Stride_in_elements = Dist / element_size * (num_elems - 1).
  const SCEV *Stride = nullptr;
  if (Size != 1 || SCEVs.size() > 2) {
    const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
    Stride = TryGetStride(Dist, Sz);
    if (!Stride)
      return std::nullopt;
  }
  if (!Stride || isa<SCEVConstant>(Stride))
    return std::nullopt;
  // Iterate through all pointers and check if all distances are
  // unique multiples of Stride.
  using DistOrdPair = std::pair<int64_t, int>;
  auto Compare = llvm::less_first();
  std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
  int Cnt = 0;
  bool IsConsecutive = true;
  for (const SCEV *PtrSCEV : SCEVs) {
    unsigned Dist = 0;
    if (PtrSCEV != PtrSCEVLowest) {
      const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
      const SCEV *Coeff = TryGetStride(Diff, Stride);
      if (!Coeff)
        return std::nullopt;
      const auto *SC = dyn_cast<SCEVConstant>(Coeff);
      if (!SC || isa<SCEVCouldNotCompute>(SC))
        return std::nullopt;
      Dist = SC->getAPInt().getZExtValue();
    }
    // If the strides are not the same or repeated, we can't vectorize.
    if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
      return std::nullopt;
    auto Res = Offsets.emplace(Dist, Cnt);
    if (!Res.second)
      return std::nullopt;
    // Consecutive order if the inserted element is the last one.
    IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
    ++Cnt;
  }
  if (Offsets.size() != SCEVs.size())
    return std::nullopt;
  SortedIndices.clear();
  if (!IsConsecutive) {
    // Fill SortedIndices array only if it is non-consecutive.
    SortedIndices.resize(PointerOps.size(), 0);
    Cnt = 0;
    for (const std::pair<int64_t, int> &Pair : Offsets) {
      SortedIndices[Cnt] = Pair.second;
      ++Cnt;
    }
  }
  if (!Inst)
    return nullptr;
  SCEVExpander Expander(SE, DL, "strided-load-vec");
  return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
}
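// Illustrative example (exposition only): for pointers p, p + s, p + 2*s and
// p + 3*s with a stride s that is only known at runtime, SCEV exposes s as the
// common multiplier of all pointer differences, the element offsets 0..3 are
// unique multiples of it, and the group can later be emitted as a single
// strided load with the stride value materialized right before \p Inst.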
4946static std::pair<InstructionCost, InstructionCost>
4951/// Returns the cost of the shuffle instructions with the given \p Kind, vector 4952/// type \p Tp and optional \p Mask. Adds SLP-specifc cost estimation for insert 4953/// subvector pattern. 4962int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4965 Mask, NumSrcElts, NumSubElts,
Index)) {
4966if (
Index + NumSubElts > NumSrcElts &&
4967Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
4976/// Correctly creates insert_subvector, checking that the index is multiple of 4977/// the subvectors length. Otherwise, generates shuffle using \p Generator or 4978/// using default shuffle. 4983if (
Index % SubVecVF == 0) {
4987// Create shuffle, insertvector requires that index is multiple of 4988// the subvector length. 4991 std::iota(
Mask.begin(),
Mask.end(), 0);
4992for (
unsignedI : seq<unsigned>(SubVecVF))
4995 Vec = Generator(Vec, V, Mask);
4997// 1. Resize V to the size of Vec. 4999 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
5007/// Correctly creates extract_subvector, checking that the index is multiple of 5008/// the subvectors length. Otherwise, generates shuffle using \p Generator or 5009/// using default shuffle. 5011unsigned SubVecVF,
unsignedIndex) {
5012if (
Index % SubVecVF == 0) {
5017// Create shuffle, extract_subvector requires that index is multiple of 5018// the subvector length. 5020 std::iota(Mask.begin(), Mask.end(),
Index);
5028unsigned *BestVF,
bool TryRecursiveCheck)
const{
5029// Check that a vectorized load would load the same memory as a scalar 5030// load. For example, we don't want to vectorize loads that are smaller 5031// than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM 5032// treats loading/storing it as an i8 struct. If we vectorize loads/stores 5033// from such a struct, we read/write packed bits disagreeing with the 5034// unvectorized version. 5041if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5044// Make sure all loads in the bundle are simple - we can't vectorize 5045// atomic or volatile loads. 5047constunsigned Sz = VL.
size();
5049auto *POIter = PointerOps.
begin();
5050for (
Value *V : VL) {
5051auto *L = dyn_cast<LoadInst>(V);
5052if (!L || !L->isSimple())
5054 *POIter = L->getPointerOperand();
5059// Check the order of pointer operands or that all pointers are the same. 5063Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5084 Ptr0 = PointerOps.
front();
5085 PtrN = PointerOps.
back();
5087 Ptr0 = PointerOps[Order.
front()];
5088 PtrN = PointerOps[Order.
back()];
5090 std::optional<int> Diff =
5092// Check that the sorted loads are consecutive. 5093if (
static_cast<unsigned>(*Diff) == Sz - 1)
5098// Simple check if not a strided access - clear order. 5099bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5100// Try to generate strided load node if: 5101// 1. Target with strided load support is detected. 5102// 2. The number of loads is greater than MinProfitableStridedLoads, 5103// or the potential stride <= MaxProfitableLoadStride and the 5104// potential stride is power-of-2 (to avoid perf regressions for the very 5105// small number of loads) and max distance > number of loads, or potential 5107// 3. The loads are ordered, or number of unordered loads <= 5108// MaxProfitableUnorderedLoads, or loads are in reversed order. 5109// (this check is to avoid extra costs for very expensive shuffles). 5110// 4. Any pointer operand is an instruction with the users outside of the 5111// current graph (for masked gathers extra extractelement instructions 5112// might be required). 5113auto IsAnyPointerUsedOutGraph =
5114 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5115return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5116 return !getTreeEntry(U) && !MustGather.contains(U);
5119constunsigned AbsoluteDiff = std::abs(*Diff);
5120if (IsPossibleStrided &&
5121 (IsAnyPointerUsedOutGraph ||
5122 (AbsoluteDiff > Sz &&
5126 *Diff == -(
static_cast<int>(Sz) - 1))) {
5127int Stride = *Diff /
static_cast<int>(Sz - 1);
5128if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5133// Iterate through all pointers and check if all distances are 5134// unique multiple of Dist. 5142// If the strides are not the same or repeated, we can't 5144if (((Dist / Stride) * Stride) != Dist ||
5145 !Dists.
insert(Dist).second)
5148if (Dists.
size() == Sz)
5154// Correctly identify compare the cost of loads + shuffles rather than 5155// strided/masked gather loads. Returns true if vectorized + shuffles 5156// representation is better than just gather. 5157auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5159bool ProfitableGatherPointers) {
5162// Compare masked gather cost and loads + insert subvector costs. 5164auto [ScalarGEPCost, VectorGEPCost] =
5166 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5167// Estimate the cost of masked gather GEP. If not a splat, roughly 5168// estimate as a buildvector, otherwise estimate as splat. 5172 VecTy->getNumElements());
5174 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5180 PtrVecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
5185/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5187// The cost of scalar loads. 5195// The cost of masked gather. 5199/*VariableMask=*/false, CommonAlignment,
CostKind) +
5200 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5205// The list of loads is small or perform partial check already - directly 5206// compare masked gather cost and gather cost. 5207constexprunsigned ListLimit = 4;
5208if (!TryRecursiveCheck || VL.
size() < ListLimit)
5211// FIXME: The following code has not been updated for non-power-of-2 5212// vectors (and not whole registers). The splitting logic here does not 5213// cover the original vector if the vector factor is not a power of two. 5217unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5220// Iterate through possible vectorization factors and check if vectorized + 5221// shuffles is better than just gather. 5227for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5233/*TryRecursiveCheck=*/false);
5234// Check that the sorted loads are consecutive. 5240 DemandedElts.
setBits(Cnt, Cnt + VF);
5243// If need the reorder - consider as high-cost masked gather for now. 5251// All loads gathered - try smaller VF. 5253// Can be vectorized later as a serie of loads/insertelements. 5255if (!DemandedElts.
isZero()) {
5260for (
unsignedIdx : seq<unsigned>(VL.
size()))
5261if (DemandedElts[
Idx])
5268auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5273 LI0->getPointerOperand(),
5274 Instruction::GetElementPtr,
CostKind, ScalarTy,
5278if (
static_cast<unsigned>(
5279count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5280 PointerOps.
size() - 1 ||
5287/*Insert=*/true,
/*Extract=*/false,
CostKind);
5292/*Insert=*/true,
/*Extract=*/false,
CostKind) +
5300 LI0->getPointerAddressSpace(),
CostKind,
5306 LI0->getPointerOperand(),
5307/*VariableMask=*/false,
5313 LI0->getPointerOperand(),
5314/*VariableMask=*/false,
5319// Gathers are already calculated - ignore. 5323for (
intIdx : seq<int>(0, VL.
size()))
5330// If masked gather cost is higher - better to vectorize, so 5331// consider it as a gather node. It will be better estimated 5333if (MaskedGatherCost >= VecLdCost &&
5342// TODO: need to improve analysis of the pointers, if not all of them are 5343// GEPs or have > 2 operands, we end up with a gather node, which just 5344// increases the cost. 5346bool ProfitableGatherPointers =
5347 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5348return L->isLoopInvariant(V);
5350if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5351auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5353 (
GEP &&
GEP->getNumOperands() == 2 &&
5354 isa<Constant, Instruction>(
GEP->getOperand(1)));
5356// Check if potential masked gather can be represented as series 5357// of loads + insertsubvectors. 5358// If masked gather cost is higher - better to vectorize, so 5359// consider it as a gather node. It will be better estimated 5361if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5362 ProfitableGatherPointers))
5375"Expected list of pointer operands.");
5376// Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each 5377// Ptr into, sort and return the sorted indices with values next to one 5385 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5387 SortedIndices.
clear();
5389auto Key = std::make_pair(BBs[Cnt + 1],
5393 std::optional<int> Diff = getPointersDiff(
5394 ElemTy, std::get<0>(Base.front()), ElemTy,
5396/*StrictCheck=*/true);
5400 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5405// If we haven't found enough to usefully cluster, return early. 5406if (Bases.
size() > VL.
size() / 2 - 1)
5409// Not found already - add a new Base 5410 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5417if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5418 Bases.
front().second.size() == VL.
size()))
5421// For each of the bases sort the pointers by Offset and check if any of the 5422// base become consecutively allocated. 5423auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5432 FirstPointers.
insert(P1);
5433 SecondPointers.
insert(P2);
5439"Unable to find matching root.");
5442for (
auto &
Base : Bases) {
5443for (
auto &Vec :
Base.second) {
5444if (Vec.size() > 1) {
5445stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5446const std::tuple<Value *, int, unsigned> &
Y) {
5447return std::get<1>(
X) < std::get<1>(
Y);
5449int InitialOffset = std::get<1>(Vec[0]);
5450bool AnyConsecutive =
5452return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5454// Fill SortedIndices array only if it looks worth-while to sort the 5461 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5465for (
auto &
T : Bases)
5466for (
constauto &Vec :
T.second)
5467for (
constauto &
P : Vec)
5471"Expected SortedIndices to be the size of VL");
5475std::optional<BoUpSLP::OrdersType>
5477assert(TE.isGather() &&
"Expected gather node only.");
5478Type *ScalarTy = TE.Scalars[0]->getType();
5481 Ptrs.
reserve(TE.Scalars.size());
5483 BBs.
reserve(TE.Scalars.size());
5484for (
Value *V : TE.Scalars) {
5485auto *L = dyn_cast<LoadInst>(V);
5486if (!L || !L->isSimple())
5493if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5495return std::move(Order);
5499/// Check if two insertelement instructions are from the same buildvector. 5503// Instructions must be from the same basic blocks. 5506// Checks if 2 insertelements are from the same buildvector. 5507if (VU->
getType() != V->getType())
5509// Multiple used inserts are separate nodes. 5516if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5518// Go through the vector operand of insertelement instructions trying to find 5519// either VU as the original vector for IE2 or V as the original vector for 5522 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5523bool IsReusedIdx =
false;
5525if (IE2 == VU && !IE1)
5527if (IE1 == V && !IE2)
5528return V->hasOneUse();
5529if (IE1 && IE1 != V) {
5531 IsReusedIdx |= ReusedIdx.
test(Idx1);
5532 ReusedIdx.
set(Idx1);
5533if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5536 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5538if (IE2 && IE2 != VU) {
5540 IsReusedIdx |= ReusedIdx.
test(Idx2);
5541 ReusedIdx.
set(Idx2);
5542if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5545 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5547 }
while (!IsReusedIdx && (IE1 || IE2));
5551std::optional<BoUpSLP::OrdersType>
5553// No need to reorder if need to shuffle reuses, still need to shuffle the 5555if (!TE.ReuseShuffleIndices.empty()) {
5556// FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. 5557assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5558"Reshuffling scalars not yet supported for nodes with padding");
5562// Check if reuse shuffle indices can be improved by reordering. 5563// For this, check that reuse mask is "clustered", i.e. each scalar values 5564// is used once in each submask of size <number_of_scalars>. 5565// Example: 4 scalar values. 5566// ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered. 5567// 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because 5568// element 3 is used twice in the second submask. 5569unsigned Sz = TE.Scalars.size();
5571if (std::optional<OrdersType> CurrentOrder =
5577OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5578unsigned Sz = TE.Scalars.size();
5579for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5582 Res[
Idx + K * Sz] =
I + K * Sz;
5584return std::move(Res);
5587if (Sz == 2 && TE.getVectorFactor() == 4 &&
5589 2 * TE.getVectorFactor())) == 1)
5594if (TE.ReorderIndices.empty())
5595 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5598::addMask(ReorderMask, TE.ReuseShuffleIndices);
5599unsigned VF = ReorderMask.
size();
5603for (
unsignedI = 0;
I < VF;
I += Sz) {
5605unsigned UndefCnt = 0;
5606unsigned Limit = std::min(Sz, VF -
I);
5615 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5619for (
unsigned K = 0; K < NumParts; ++K) {
5620unsignedIdx = Val + Sz * K;
5622 ResOrder[
Idx] =
I + K;
5625return std::move(ResOrder);
5627unsigned VF = TE.getVectorFactor();
5628// Try build correct order for extractelement instructions. 5630 TE.ReuseShuffleIndices.end());
5631if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5633 if (isa<PoisonValue>(V))
5635 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5636 return Idx && *Idx < Sz;
5638assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported " 5639"by BinaryOperator and CastInst.");
5641if (TE.ReorderIndices.empty())
5642 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5645for (
unsignedI = 0;
I < VF; ++
I) {
5646int &
Idx = ReusedMask[
I];
5649Value *V = TE.Scalars[ReorderMask[
Idx]];
5651Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5654// Build the order of the VF size, need to reorder reuses shuffles, they are 5655// always of VF size. 5657 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5658auto *It = ResOrder.
begin();
5659for (
unsigned K = 0; K < VF; K += Sz) {
5663 std::iota(SubMask.begin(), SubMask.end(), 0);
5665transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5666 std::advance(It, Sz);
5671return std::nullopt;
// No need to reorder. 5672return std::move(ResOrder);
5674if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5675any_of(TE.UserTreeIndices,
5677 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5681if ((TE.State == TreeEntry::Vectorize ||
5682 TE.State == TreeEntry::StridedVectorize) &&
5683 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5684 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5685assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by " 5686"BinaryOperator and CastInst.");
5687return TE.ReorderIndices;
5689if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5690if (!TE.ReorderIndices.empty())
5691return TE.ReorderIndices;
5694for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5695if (!V->hasNUsesOrMore(1))
5697auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5702while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5704II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5710assert(BB1 != BB2 &&
"Expected different basic blocks.");
5711auto *NodeA = DT->
getNode(BB1);
5712auto *NodeB = DT->
getNode(BB2);
5713assert(NodeA &&
"Should only process reachable instructions");
5714assert(NodeB &&
"Should only process reachable instructions");
5715assert((NodeA == NodeB) ==
5716 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5717"Different nodes should have different DFS numbers");
5718return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5720auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5721Value *V1 = TE.Scalars[I1];
5722Value *V2 = TE.Scalars[I2];
5723if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5725if (isa<PoisonValue>(V1))
5727if (isa<PoisonValue>(V2))
5733auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5734auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5735if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5736return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5737 FirstUserOfPhi2->getParent());
5738auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5739auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5740auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5741auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5747if (UserBVHead[I1] && !UserBVHead[I2])
5751if (UserBVHead[I1] == UserBVHead[I2])
5754return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5756return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5763auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5764auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5765auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5766auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5769if (EE1->getOperand(0) == EE2->getOperand(0))
5773if (Inst1 && Inst2) {
5781"Expected either instructions or arguments vector operands.");
5782return P1->getArgNo() < P2->getArgNo();
5787 std::iota(Phis.
begin(), Phis.
end(), 0);
5790return std::nullopt;
// No need to reorder. 5791return std::move(Phis);
5793if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5795// TODO: add analysis of other gather nodes with extractelement 5796// instructions and other values/instructions, not only undefs. 5797if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5798 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5799any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5801 auto *EE = dyn_cast<ExtractElementInst>(V);
5802 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5804// Check that gather of extractelements can be represented as 5805// just a shuffle of a single vector. 5808 canReuseExtract(TE.Scalars, CurrentOrder,
/*ResizeAllowed=*/true);
5809if (Reuse || !CurrentOrder.
empty())
5810return std::move(CurrentOrder);
5812// If the gather node is <undef, v, .., poison> and 5813// insertelement poison, v, 0 [+ permute] 5815// insertelement poison, v, n - try to reorder. 5816// If rotating the whole graph, exclude the permute cost, the whole graph 5817// might be transformed. 5818int Sz = TE.Scalars.size();
5820count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5822find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5823if (It == TE.Scalars.begin())
5826if (It != TE.Scalars.end()) {
5828unsignedIdx = std::distance(TE.Scalars.begin(), It);
5843if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5846return std::move(Order);
5852if (TE.Scalars.size() >= 3)
5855// Check if can include the order of vectorized loads. For masked gathers do 5856// extra analysis later, so include such nodes into a special list. 5857if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5861 CurrentOrder, PointerOps);
5863return std::move(CurrentOrder);
5865// FIXME: Remove the non-power-of-two check once findReusedOrderedScalars 5866// has been auditted for correctness with non-power-of-two vectors. 5874/// Checks if the given mask is a "clustered" mask with the same clusters of 5875/// size \p Sz, which are not identity submasks. 5881for (
unsignedI = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5883if (Cluster != FirstCluster)
5889void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const{
5890// Reorder reuses mask. 5892constunsigned Sz =
TE.Scalars.size();
5893// For vectorized and non-clustered reused no need to do anything else. 5894if (!
TE.isGather() ||
5902// Clear reorder since it is going to be applied to the new mask. 5903TE.ReorderIndices.clear();
5904// Try to improve gathered nodes with clustered reuses, if possible. 5909// Fill the reuses mask with the identity submasks. 5910for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5911 *
End =
TE.ReuseShuffleIndices.end();
5912 It !=
End; std::advance(It, Sz))
5913 std::iota(It, std::next(It, Sz), 0);
5919"Expected same size of orders");
5920unsigned Sz = Order.
size();
5922for (
unsignedIdx : seq<unsigned>(0, Sz)) {
5923if (Order[
Idx] != Sz)
5924 UsedIndices.
set(Order[
Idx]);
5926if (SecondaryOrder.
empty()) {
5927for (
unsignedIdx : seq<unsigned>(0, Sz))
5928if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5931for (
unsignedIdx : seq<unsigned>(0, Sz))
5932if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5933 !UsedIndices.
test(SecondaryOrder[
Idx]))
5934 Order[
Idx] = SecondaryOrder[
Idx];
5939// Maps VF to the graph nodes. 5941// ExtractElement gather nodes which can be vectorized and need to handle 5945// Phi nodes can have preferred ordering based on their result users 5948// AltShuffles can also have a preferred ordering that leads to fewer 5949// instructions, e.g., the addsub instruction in x86. 5952// Maps a TreeEntry to the reorder indices of external users. 5954 ExternalUserReorderMap;
5955// Find all reorderable nodes with the given VF. 5956// Currently the are vectorized stores,loads,extracts + some gathering of 5959const std::unique_ptr<TreeEntry> &TE) {
5960// Look for external users that will probably be vectorized. 5962 findExternalStoreUsersReorderIndices(TE.get());
5963if (!ExternalUserReorderIndices.
empty()) {
5964 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5966 std::move(ExternalUserReorderIndices));
5969// Patterns like [fadd,fsub] can be combined into a single instruction in 5970// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need 5971// to take into account their order when looking for the most used order. 5972if (TE->hasState() && TE->isAltShuffle()) {
5975unsigned Opcode0 = TE->getOpcode();
5976unsigned Opcode1 = TE->getAltOpcode();
5978// If this pattern is supported by the target then we consider the order. 5979if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5980 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5983// TODO: Check the reverse order too. 5986if (std::optional<OrdersType> CurrentOrder =
5988// Do not include ordering for nodes used in the alt opcode vectorization, 5989// better to reorder them during bottom-to-top stage. If follow the order 5990// here, it causes reordering of the whole graph though actually it is 5991// profitable just to reorder the subgraph that starts from the alternate 5992// opcode vectorization node. Such nodes already end-up with the shuffle 5993// instruction and it is just enough to change this shuffle rather than 5994// rotate the scalars for the whole graph. 5996const TreeEntry *UserTE = TE.get();
5998if (UserTE->UserTreeIndices.size() != 1)
6001 return EI.UserTE->State == TreeEntry::Vectorize &&
6002 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
6005 UserTE = UserTE->UserTreeIndices.back().UserTE;
6008 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
6009if (!(TE->State == TreeEntry::Vectorize ||
6010 TE->State == TreeEntry::StridedVectorize) ||
6011 !TE->ReuseShuffleIndices.empty())
6012 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
6013if (TE->State == TreeEntry::Vectorize &&
6014 TE->getOpcode() == Instruction::PHI)
6015 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6019// Reorder the graph nodes according to their vectorization factor. 6020for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6021 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6022auto It = VFToOrderedEntries.
find(VF);
6023if (It == VFToOrderedEntries.
end())
6025// Try to find the most profitable order. We just are looking for the most 6026// used order and reorder scalar elements in the nodes according to this 6027// mostly used order. 6029// Delete VF entry upon exit. 6032// All operands are reordered and used only in this node - propagate the 6033// most used order to the user node. 6038for (
const TreeEntry *OpTE : OrderedEntries) {
6039// No need to reorder this nodes, still need to extend and to use shuffle, 6040// just need to merge reordering shuffle and the reuse shuffle. 6041if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6043// Count number of orders uses. 6044constauto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6046if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6047auto It = GathersToOrders.find(OpTE);
6048if (It != GathersToOrders.end())
6051if (OpTE->hasState() && OpTE->isAltShuffle()) {
6052auto It = AltShufflesToOrders.find(OpTE);
6053if (It != AltShufflesToOrders.end())
6056if (OpTE->State == TreeEntry::Vectorize &&
6057 OpTE->getOpcode() == Instruction::PHI) {
6058auto It = PhisToOrders.
find(OpTE);
6059if (It != PhisToOrders.
end())
6062return OpTE->ReorderIndices;
6064// First consider the order of the external scalar users. 6065auto It = ExternalUserReorderMap.
find(OpTE);
6066if (It != ExternalUserReorderMap.
end()) {
6067constauto &ExternalUserReorderIndices = It->second;
6068// If the OpTE vector factor != number of scalars - use natural order, 6069// it is an attempt to reorder node with reused scalars but with 6071if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6072 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6073 ExternalUserReorderIndices.size();
6075for (
constOrdersType &ExtOrder : ExternalUserReorderIndices)
6076 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6078// No other useful reorder data in this entry. 6082// Stores actually store the mask, not the order, need to invert. 6083if (OpTE->State == TreeEntry::Vectorize &&
6084 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6085assert(!OpTE->isAltShuffle() &&
6086"Alternate instructions are only supported by BinaryOperator " 6090unsignedE = Order.size();
6093 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6096 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6098 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6101if (OrdersUses.empty())
6103// Choose the most used order. 6104unsigned IdentityCnt = 0;
6105unsigned FilledIdentityCnt = 0;
6107for (
auto &Pair : OrdersUses) {
6109if (!Pair.first.empty())
6110 FilledIdentityCnt += Pair.second;
6111 IdentityCnt += Pair.second;
6116unsigned Cnt = IdentityCnt;
6117for (
auto &Pair : OrdersUses) {
6118// Prefer identity order. But, if filled identity found (non-empty order) 6119// with same number of uses, as the new candidate order, we can choose 6120// this candidate order. 6121if (Cnt < Pair.second ||
6122 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6123 Cnt == Pair.second && !BestOrder.
empty() &&
6126 BestOrder = Pair.first;
6132// Set order of the user node. 6139unsignedE = BestOrder.
size();
6141 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6143// Do an actual reordering, if profitable. 6144for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6145// Just do the reordering for the nodes with the given VF. 6146if (TE->Scalars.size() != VF) {
6147if (TE->ReuseShuffleIndices.size() == VF) {
6148// Need to reorder the reuses masks of the operands with smaller VF to 6149// be able to find the match between the graph nodes and scalar 6150// operands of the given node during vectorization/cost estimation. 6153 return EI.UserTE->Scalars.size() == VF ||
6154 EI.UserTE->Scalars.size() ==
6157"All users must be of VF size.");
6160// ShuffleVectorInst does not do reorderOperands (and it should not 6161// because ShuffleVectorInst supports only a limited set of 6162// patterns). Only do reorderNodeWithReuses if all of the users are 6163// not ShuffleVectorInst. 6165 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6170 return isa<ShuffleVectorInst>(
6171 EI.UserTE->getMainOp());
6173"Does not know how to reorder.");
6175// Update ordering of the operands with the smaller VF than the given 6177 reorderNodeWithReuses(*TE, Mask);
6181if ((TE->State == TreeEntry::Vectorize ||
6182 TE->State == TreeEntry::StridedVectorize) &&
6185 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6186assert(!TE->isAltShuffle() &&
6187"Alternate instructions are only supported by BinaryOperator " 6189// Build correct orders for extract{element,value}, loads and 6192if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6193 TE->reorderOperands(Mask);
6195// Reorder the node and its operands. 6196 TE->reorderOperands(Mask);
6197assert(TE->ReorderIndices.empty() &&
6198"Expected empty reorder sequence.");
6201if (!TE->ReuseShuffleIndices.empty()) {
6202// Apply reversed order to keep the original ordering of the reused 6203// elements to avoid extra reorder indices shuffling. 6208addMask(NewReuses, TE->ReuseShuffleIndices);
6209 TE->ReuseShuffleIndices.swap(NewReuses);
6215bool BoUpSLP::canReorderOperands(
6216 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6219for (
unsignedI = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6220if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6221return OpData.first ==
I &&
6222 (OpData.second->State == TreeEntry::Vectorize ||
6223 OpData.second->State == TreeEntry::StridedVectorize);
6226if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6227// Do not reorder if operand node is used by many user nodes. 6228if (
any_of(TE->UserTreeIndices,
6229 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6231// Add the node to the list of the ordered nodes with the identity 6233 Edges.emplace_back(
I, TE);
6234// Add ScatterVectorize nodes to the list of operands, where just 6235// reordering of the scalars is required. Similar to the gathers, so 6236// simply add to the list of gathered ops. 6237// If there are reused scalars, process this node as a regular vectorize 6238// node, just reorder reuses mask. 6239if (TE->State != TreeEntry::Vectorize &&
6240 TE->State != TreeEntry::StridedVectorize &&
6241 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6245 TreeEntry *
Gather =
nullptr;
6247 [&
Gather, UserTE,
I](TreeEntry *TE) {
6248assert(TE->State != TreeEntry::Vectorize &&
6249 TE->State != TreeEntry::StridedVectorize &&
6250"Only non-vectorized nodes are expected.");
6251if (
any_of(TE->UserTreeIndices,
6252 [UserTE,
I](
const EdgeInfo &EI) {
6253 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6255assert(TE->isSame(UserTE->getOperand(
I)) &&
6256"Operand entry does not match operands.");
6273// Find all reorderable leaf nodes with the given VF. 6274// Currently the are vectorized loads,extracts without alternate operands + 6275// some gathering of extracts. 6277for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6278if (TE->State != TreeEntry::Vectorize &&
6279 TE->State != TreeEntry::StridedVectorize)
6281if (std::optional<OrdersType> CurrentOrder =
6283 OrderedEntries.
insert(TE.get());
6284if (!(TE->State == TreeEntry::Vectorize ||
6285 TE->State == TreeEntry::StridedVectorize) ||
6286 !TE->ReuseShuffleIndices.empty())
6287 GathersToOrders.
insert(TE.get());
6291// 1. Propagate order to the graph nodes, which use only reordered nodes. 6292// I.e., if the node has operands, that are reordered, try to make at least 6293// one operand order in the natural order and reorder others + reorder the 6296while (!OrderedEntries.
empty()) {
6297// 1. Filter out only reordered nodes. 6298// 2. If the entry has multiple uses - skip it and jump to the next node. 6301for (TreeEntry *TE : OrderedEntries) {
6302if (!(TE->State == TreeEntry::Vectorize ||
6303 TE->State == TreeEntry::StridedVectorize ||
6304 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6305 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6308 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6310 !Visited.
insert(TE).second) {
6314// Build a map between user nodes and their operands order to speedup 6315// search. The graph currently does not provide this dependency directly. 6316for (
EdgeInfo &EI : TE->UserTreeIndices)
6319// Erase filtered entries. 6320for (TreeEntry *TE : Filtered)
6321 OrderedEntries.remove(TE);
6323 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6325sort(UsersVec, [](
constauto &Data1,
constauto &Data2) {
6326return Data1.first->Idx > Data2.first->Idx;
6328for (
auto &
Data : UsersVec) {
6329// Check that operands are used only in the User node. 6331if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6333for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6334 OrderedEntries.remove(
Op.second);
6337// All operands are reordered and used only in this node - propagate the 6338// most used order to the user node. 6342// Do the analysis for each tree entry only once, otherwise the order of 6343// the same node my be considered several times, though might be not 6347for (
constauto &
Op :
Data.second) {
6348 TreeEntry *OpTE =
Op.second;
6349if (!VisitedOps.
insert(OpTE).second)
6351if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6354if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6357return OpTE->ReorderIndices;
6359// The order is partially ordered, skip it in favor of fully non-ordered 6361if (Order.size() == 1)
6364Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6365 return P.second == OpTE;
6367// Stores actually store the mask, not the order, need to invert. 6368if (OpTE->State == TreeEntry::Vectorize &&
6369 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6370assert(!OpTE->isAltShuffle() &&
6371"Alternate instructions are only supported by BinaryOperator " 6375unsignedE = Order.size();
6378 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6381 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6384 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6386auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6387constauto AllowsReordering = [&](
const TreeEntry *TE) {
6388if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6389 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6390 (IgnoreReorder && TE->Idx == 0))
6392if (TE->isGather()) {
6401for (
constEdgeInfo &EI : OpTE->UserTreeIndices) {
6402 TreeEntry *UserTE = EI.
UserTE;
6403if (!VisitedUsers.
insert(UserTE).second)
6405// May reorder user node if it requires reordering, has reused 6406// scalars, is an alternate op vectorize node or its op nodes require 6408if (AllowsReordering(UserTE))
6410// Check if users allow reordering. 6411// Currently look up just 1 level of operands to avoid increase of 6413// Profitable to reorder if definitely more operands allow 6414// reordering rather than those with natural order. 6417 Ops, [UserTE, &AllowsReordering](
6418const std::pair<unsigned, TreeEntry *> &
Op) {
6419return AllowsReordering(
Op.second) &&
6422 return EI.UserTE == UserTE;
6424 })) <= Ops.
size() / 2)
6425 ++Res.first->second;
6428if (OrdersUses.empty()) {
6429for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6430 OrderedEntries.remove(
Op.second);
6433// Choose the most used order. 6434unsigned IdentityCnt = 0;
6435unsigned VF =
Data.second.front().second->getVectorFactor();
6437for (
auto &Pair : OrdersUses) {
6439 IdentityCnt += Pair.second;
6444unsigned Cnt = IdentityCnt;
6445for (
auto &Pair : OrdersUses) {
6446// Prefer identity order. But, if filled identity found (non-empty 6447// order) with same number of uses, as the new candidate order, we can 6448// choose this candidate order. 6449if (Cnt < Pair.second) {
6451 BestOrder = Pair.first;
6457// Set order of the user node. 6459for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6460 OrderedEntries.remove(
Op.second);
6464// Erase operands from OrderedEntries list and adjust their orders. 6469unsignedE = BestOrder.
size();
6471 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6473for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6474 TreeEntry *TE =
Op.second;
6475 OrderedEntries.remove(TE);
6476if (!VisitedOps.
insert(TE).second)
6478if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6479 reorderNodeWithReuses(*TE, Mask);
6482// Gathers are processed separately. 6483if (TE->State != TreeEntry::Vectorize &&
6484 TE->State != TreeEntry::StridedVectorize &&
6485 (TE->State != TreeEntry::ScatterVectorize ||
6486 TE->ReorderIndices.empty()))
6488assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6489 TE->ReorderIndices.empty()) &&
6490"Non-matching sizes of user/operand entries.");
6492if (IgnoreReorder && TE == VectorizableTree.front().get())
6493 IgnoreReorder =
false;
6495// For gathers just need to reorder its scalars. 6496for (TreeEntry *
Gather : GatherOps) {
6498"Unexpected reordering of gathers.");
6499if (!
Gather->ReuseShuffleIndices.empty()) {
6500// Just reorder reuses indices. 6505 OrderedEntries.remove(
Gather);
      // Reorder operands of the user node and set the ordering for the user
      // node itself.
      if (Data.first->State != TreeEntry::Vectorize ||
6510 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6511Data.first->getMainOp()) ||
6512Data.first->isAltShuffle())
6513Data.first->reorderOperands(Mask);
6514if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6515Data.first->isAltShuffle() ||
6516Data.first->State == TreeEntry::StridedVectorize) {
6519/*BottomOrder=*/true);
6520if (
Data.first->ReuseShuffleIndices.empty() &&
6521 !
Data.first->ReorderIndices.empty() &&
6522 !
Data.first->isAltShuffle()) {
        // Insert user node to the list to try to sink reordering deeper in
        // the graph.
        OrderedEntries.insert(Data.first);
  // If the reordering is unnecessary, just remove the reorder.
  if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6534 VectorizableTree.front()->ReuseShuffleIndices.empty())
6535 VectorizableTree.front()->ReorderIndices.clear();
Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
  if ((Entry.getOpcode() == Instruction::Store ||
       Entry.getOpcode() == Instruction::Load) &&
      Entry.State == TreeEntry::StridedVectorize &&
      !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
    return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
  return dyn_cast<Instruction>(Entry.Scalars.front());
}
  // Collect the values that we need to extract from the tree.
  for (auto &TEPtr : VectorizableTree) {
6552 TreeEntry *Entry = TEPtr.get();
    // No need to handle users of gathered values.
    if (Entry->isGather())
6559for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6560Value *Scalar = Entry->Scalars[Lane];
6561if (!isa<Instruction>(Scalar))
      // All uses must be replaced already? No need to do it again.
      auto It = ScalarToExtUses.find(Scalar);
6565if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
      // Check if the scalar is externally used as an extra arg.
      const auto ExtI = ExternallyUsedValues.find(Scalar);
6570if (ExtI != ExternallyUsedValues.
end()) {
6571int FoundLane = Entry->findLaneForValue(Scalar);
6573 << FoundLane <<
" from " << *Scalar <<
".\n");
6574 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6575 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6578for (
User *U : Scalar->users()) {
        // Ignore users in the user ignore list.
        if (UserIgnoreList && UserIgnoreList->contains(UserInst))
        // Skip in-tree scalars that become vectors.
        if (TreeEntry *UseEntry = getTreeEntry(U)) {
          // Some in-tree scalars will remain as scalar in vectorized
          // instructions. If that is the case, the one in FoundLane will
          // be used.
          if (UseEntry->State == TreeEntry::ScatterVectorize ||
6596 Scalar, getRootEntryInstruction(*UseEntry), TLI,
TTI)) {
6599assert(!UseEntry->isGather() &&
"Bad state");
6603if (It != ScalarToExtUses.
end()) {
6604 ExternalUses[It->second].User =
nullptr;
6609if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6611int FoundLane = Entry->findLaneForValue(Scalar);
6613 <<
" from lane " << FoundLane <<
" from " << *Scalar
6615 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6616 ExternalUses.emplace_back(Scalar, U, FoundLane);
6625BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const{
6629for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6630Value *V = TE->Scalars[Lane];
    // Don't iterate over the users of constant data.
    if (!isa<Instruction>(V))
      continue;
    // To save compilation time we don't visit if we have too many users.
    // Collect stores per pointer object.
    for (User *U : V->users()) {
6640auto *SI = dyn_cast<StoreInst>(U);
      // Test whether we can handle the store. V might be a global, which could
      // be used in a different function.
      if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6646// Skip entry if already 6652auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6653 SI->getValueOperand()->getType(),
Ptr}];
      // For now just keep one store per pointer object per lane.
      // TODO: Extend this to support multiple stores per pointer per lane.
      if (StoresVec.size() > Lane)
6658if (!StoresVec.empty()) {
6660 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6661 SI->getValueOperand()->getType(),
6662 StoresVec.front()->getPointerOperand(), *
DL, *SE,
6663/*StrictCheck=*/true);
6664// We failed to compare the pointers so just abandon this store. 6668 StoresVec.push_back(SI);
6673for (
auto &
P : PtrToStoresMap) {
6674 Res[
I].swap(
P.second);
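  // Illustrative note (a sketch, not from the original source): at this point
  // PtrToStoresMap groups the candidate user stores by {parent block, stored
  // type, underlying pointer object}, with at most one StoreInst collected per
  // lane. E.g., for scalars {x0, x1} stored via "A[0] = x0; A[1] = x1;" (A, x0
  // and x1 are hypothetical names) the map holds a single entry keyed by the
  // object A with both stores, which is the shape canFormVector() expects.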
6681 OrdersType &ReorderIndices)
const{
  // We check whether the stores in StoresVec can form a vector by sorting them
  // and checking whether they are consecutive.
  // To avoid calling getPointersDiff() while sorting we create a vector of
  // pairs {store, offset from first} and sort this instead.
  for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6694 std::optional<int> Diff =
6696SI->getPointerOperand(), *
DL, *SE,
6697/*StrictCheck=*/true);
  // Check if the stores are consecutive by checking if their difference is 1.
  if (StoreOffsetVec.size() != StoresVec.size())
       [](const std::pair<int, unsigned> &L,
          const std::pair<int, unsigned> &R) { return L.first < R.first; });
  for (const auto &P : StoreOffsetVec) {
    if (Idx > 0 && P.first != PrevDist + 1)
  // Calculate the shuffle indices according to their offset against the sorted
  // StoreOffsetVec.
  ReorderIndices.assign(StoresVec.size(), 0);
  bool IsIdentity = true;
    ReorderIndices[P.second] = I;
    IsIdentity &= P.second == I;
  // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
  // reorderTopToBottom() and reorderBottomToTop(), so we are following the
  // same convention here.
  if (IsIdentity)
    ReorderIndices.clear();
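  // Worked example (illustrative, not from the original source): for stores
  // whose offsets from the first store are {0, +2, +1}, the sorted offsets
  // {0, +1, +2} are consecutive and ReorderIndices becomes {0, 2, 1}: lane 1
  // (offset +2) is written to vector position 2 and lane 2 (offset +1) to
  // position 1. Already-consecutive stores produce the identity order, which
  // is encoded as the empty OrdersType per the convention above.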
  for (unsigned Idx : Order)
BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
  unsigned NumLanes = TE->Scalars.size();
  // Holds the reorder indices for each candidate store vector that is a user
  // of the current TreeEntry.
  // Now inspect the stores collected per pointer and look for vectorization
  // candidates. For each candidate calculate the reorder index vector and push
  // it into `ExternalReorderIndices`.
    // If we have fewer than NumLanes stores, then we can't form a vector.
    if (StoresVec.size() != NumLanes)
    // If the stores are not consecutive then abandon this StoresVec.
    if (!canFormVector(StoresVec, ReorderIndices))
      continue;
    // We now know that the scalars in StoresVec can form a vector instruction,
    // so set the reorder indices.
    ExternalReorderIndices.push_back(ReorderIndices);
  return ExternalReorderIndices;
6774 UserIgnoreList = &UserIgnoreLst;
6777 buildTree_rec(Roots, 0,
EdgeInfo());
6784 buildTree_rec(Roots, 0,
EdgeInfo());
6787/// Tries to find subvector of loads and builds new vector of only loads if can 6801for (
Value *V : VL) {
6802auto *LI = dyn_cast<LoadInst>(V);
6805if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6808for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6809assert(LI->getParent() ==
Data.front().first->getParent() &&
6810 LI->getType() ==
Data.front().first->getType() &&
6814"Expected loads with the same type, same parent and same " 6815"underlying pointer.");
6817 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6818Data.front().first->getPointerOperand(),
DL, SE,
6819/*StrictCheck=*/true);
6822auto It = Map.find(*Dist);
6823if (It != Map.end() && It->second != LI)
6825if (It == Map.end()) {
6826Data.emplace_back(LI, *Dist);
6827 Map.try_emplace(*Dist, LI);
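      // Illustrative note (not from the original source): loads are clustered
      // by their element distance from the first load of a cluster. E.g.,
      // loads from p, p+1 and p+3 form one cluster with recorded distances
      // {0, 1, 3}; a different load that also sits at distance 1 is skipped by
      // the `It->second != LI` check above instead of being re-added.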
6837auto FindMatchingLoads =
6842int &
Offset,
unsigned &Start) {
6844return GatheredLoads.
end();
6854 std::optional<int> Dist =
6856Data.front().first->getType(),
6857Data.front().first->getPointerOperand(),
DL, SE,
6858/*StrictCheck=*/true);
6863for (std::pair<LoadInst *, int>
P :
Data) {
6867// Found matching gathered loads - check if all loads are unique or 6868// can be effectively vectorized. 6869unsigned NumUniques = 0;
6870for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6871bool Used = DataLoads.
contains(Pair.first);
6872if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6876 Repeated.insert(Cnt);
6879if (NumUniques > 0 &&
6880 (Loads.
size() == NumUniques ||
6881 (Loads.
size() - NumUniques >= 2 &&
6882 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6888return std::next(GatheredLoads.
begin(),
Idx);
6892return GatheredLoads.
end();
6894for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6898auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6900while (It != GatheredLoads.
end()) {
6901assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6902for (
unsignedIdx : LocalToAdd)
6904 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6905 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6909 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6913for (
unsignedIdx : seq<unsigned>(
Data.size())) {
6922 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6923return PD.front().first->getParent() == LI->
getParent() &&
6924 PD.front().first->getType() == LI->
getType();
6926while (It != GatheredLoads.
end()) {
6929 std::next(It), GatheredLoads.
end(),
6930 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6931 return PD.front().first->getParent() == LI->getParent() &&
6932 PD.front().first->getType() == LI->getType();
6936 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6937 AddNewLoads(GatheredLoads.emplace_back());
6942void BoUpSLP::tryToVectorizeGatheredLoads(
6945 8> &GatheredLoads) {
6946 GatheredLoadsEntriesFirst = VectorizableTree.size();
6949 LoadEntriesToVectorize.
size());
6950for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6951Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6952 VectorizableTree[
Idx]->Scalars.end());
6954// Sort loads by distance. 6955auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6956const std::pair<LoadInst *, int> &L2) {
6957return L1.second > L2.second;
6963Align Alignment = computeCommonAlignment<LoadInst>(Values);
6972bool Final,
unsigned MaxVF) {
6974unsigned StartIdx = 0;
6979 *
TTI, Loads.
front()->getType(), MaxVF);
6981 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
6987if (Final && CandidateVFs.
empty())
6990unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6991for (
unsigned NumElts : CandidateVFs) {
6992if (Final && NumElts > BestVF)
6995for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
6999if (VectorizedLoads.count(Slice.
front()) ||
7000 VectorizedLoads.count(Slice.
back()) ||
              // Check if it is profitable to try vectorizing gathered loads.
              // It is profitable if we have more than 3 consecutive loads or
              // if we have fewer but all users are vectorized or deleted.
              bool AllowToVectorize = false;
              // Check if it is profitable to vectorize 2-element loads.
              // If single use/user - allow to vectorize.
              // 1. Check if number of uses equals number of users.
              // 2. All users are deleted.
              // 3. The load broadcasts are not allowed or the load is not
              //    broadcasted.
              if (
static_cast<unsignedint>(std::distance(
7021 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7023if (!IsLegalBroadcastLoad)
7027for (
User *U : LI->users()) {
7028if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7030if (
const TreeEntry *UTE = getTreeEntry(U)) {
7031for (
intI : seq<int>(UTE->getNumOperands())) {
7033 [LI](
Value *V) { return V == LI; }))
7034// Found legal broadcast - do not vectorize. 7042 AllowToVectorize = CheckIfAllowed(Slice);
7046any_of(ValueToGatherNodes.at(Slice.front()),
7047 [=](
const TreeEntry *TE) {
7048 return TE->Scalars.size() == 2 &&
7049 ((TE->Scalars.front() == Slice.front() &&
7050 TE->Scalars.back() == Slice.back()) ||
7051 (TE->Scalars.front() == Slice.back() &&
7052 TE->Scalars.back() == Slice.front()));
7057if (AllowToVectorize) {
7060// Try to build vector load. 7062reinterpret_cast<Value *
const*
>(Slice.begin()), Slice.size());
7064 PointerOps, &BestVF);
7066 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7068if (MaskedGatherVectorized.
empty() ||
7069 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7074Results.emplace_back(Values, LS);
7075 VectorizedLoads.insert(Slice.begin(), Slice.end());
                    // If we vectorized initial block, no need to try to
                    // vectorize it again.
                    StartIdx += NumElts;
                    // Check if the whole array was vectorized already - exit.
                    if (StartIdx >= Loads.size())
                  // Erase last masked gather candidate, if another candidate
                  // within the range is found to be better.
                  if (!MaskedGatherVectorized.empty() &&
                      Cnt < MaskedGatherVectorized.back() + NumElts)
7093if (!AllowToVectorize || BestVF == 0)
        // Mark masked gathers candidates as vectorized, if any.
        for (unsigned Cnt : MaskedGatherVectorized) {
7099 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7103 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
          // If we vectorized initial block, no need to try to vectorize it
          // again.
          StartIdx += NumElts;
7110if (!VectorizedLoads.contains(LI))
7111 NonVectorized.push_back(LI);
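    // Illustrative note (not from the original source): per the profitability
    // rules above, a run of 4 or more consecutive candidate loads is always
    // worth a vectorization attempt, while a 2-element slice is only tried
    // when its loads have no remaining scalar users (all users vectorized or
    // deleted) and the target cannot simply re-broadcast the loaded value.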
7115auto ProcessGatheredLoads =
7120for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7121if (LoadsDists.size() <= 1) {
7122 NonVectorized.
push_back(LoadsDists.back().first);
7127transform(LoadsDists, OriginalLoads.begin(),
7128 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7133unsigned MaxConsecutiveDistance = 0;
7134unsigned CurrentConsecutiveDist = 1;
7135int LastDist = LocalLoadsDists.
front().second;
7136bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7137for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7138if (getTreeEntry(
L.first))
7141"Expected first distance always not less than second");
7142if (
static_cast<unsigned>(LastDist -
L.second) ==
7143 CurrentConsecutiveDist) {
7144 ++CurrentConsecutiveDist;
7145 MaxConsecutiveDistance =
7146 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7150if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7153 CurrentConsecutiveDist = 1;
7157if (Loads.
size() <= 1)
7159if (AllowMaskedGather)
7160 MaxConsecutiveDistance = Loads.
size();
7161elseif (MaxConsecutiveDistance < 2)
7166 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7167 Final, MaxConsecutiveDistance);
7169 OriginalLoads.size() == Loads.
size() &&
7170 MaxConsecutiveDistance == Loads.
size() &&
7175 VectorizedLoads.
clear();
7179 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7180 UnsortedNonVectorized, Final,
7181 OriginalLoads.size());
7182if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7183 SortedNonVectorized.
swap(UnsortedNonVectorized);
7189 << Slice.
size() <<
")\n");
7190if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
7191for (
Value *L : Slice)
7192if (!getTreeEntry(L))
7193 SortedNonVectorized.
push_back(cast<LoadInst>(L));
          // Select maximum VF as a maximum of user gathered nodes and
          // distance between scalar loads in these nodes.
          unsigned MaxVF = Slice.size();
7200unsigned UserMaxVF = 0;
7201unsigned InterleaveFactor = 0;
            // Found distance between segments of the interleaved loads.
            std::optional<unsigned> InterleavedLoadsDistance = 0;
            std::optional<unsigned> CommonVF = 0;
7212for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7213 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7216 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7218if (*CommonVF == 0) {
7219 CommonVF =
E->Scalars.size();
7222if (*CommonVF !=
E->Scalars.size())
              // Check if the load is part of the interleaved load.
              if (Pos != Idx && InterleavedLoadsDistance) {
7229 if (isa<Constant>(V))
7231 if (getTreeEntry(V))
7233 const auto &Nodes = ValueToGatherNodes.at(V);
7234 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7235 !is_contained(Slice, V);
7237 InterleavedLoadsDistance.reset();
7241if (*InterleavedLoadsDistance == 0) {
7242 InterleavedLoadsDistance =
Idx - Pos;
7245if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7246 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7247 InterleavedLoadsDistance.reset();
7248 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7252 DeinterleavedNodes.
clear();
            // Check if the large load represents an interleaved load
            // operation.
            if (InterleavedLoadsDistance.value_or(0) > 1 &&
                CommonVF.value_or(0) != 0) {
7256 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7257unsigned VF = *CommonVF;
7260// Segmented load detected - vectorize at maximum vector factor. 7261if (InterleaveFactor <= Slice.size() &&
7265 cast<LoadInst>(Slice.front())->getAlign(),
7266 cast<LoadInst>(Slice.front())
7270 UserMaxVF = InterleaveFactor * VF;
7272 InterleaveFactor = 0;
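            // Worked example (illustrative, not from the original source): if
            // the gathered user nodes all have VF 4 (CommonVF == 4) and the
            // load segments repeat every 2 elements
            // (InterleavedLoadsDistance == 2), the slice is treated as a 2-way
            // interleaved load: InterleaveFactor = bit_ceil(2) = 2 and
            // UserMaxVF = InterleaveFactor * VF = 8, provided the target
            // reports the corresponding segmented access as legal.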
7275// Cannot represent the loads as consecutive vectorizable nodes - 7277unsigned ConsecutiveNodesSize = 0;
7278if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7279any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7280 [&, Slice = Slice](
constauto &
P) {
7282return std::get<1>(
P).contains(V);
7284if (It == Slice.end())
7287 VectorizableTree[std::get<0>(
P)]->Scalars;
7288 ConsecutiveNodesSize += VL.
size();
7289unsigned Start = std::distance(Slice.begin(), It);
7290unsigned Sz = Slice.size() - Start;
7291return Sz < VL.
size() ||
7292 Slice.slice(std::distance(Slice.begin(), It),
          // Try to build long masked gather loads.
          if (InterleaveFactor == 0 &&
7299any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7300 [&, Slice = Slice](
unsignedIdx) {
7302 SmallVector<Value *> PointerOps;
7303 return canVectorizeLoads(
7304 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7305 Slice[Idx * UserMaxVF], Order,
7307 LoadsState::ScatterVectorize;
7310if (Slice.size() != ConsecutiveNodesSize)
7311 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7313for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7314bool IsVectorized =
true;
7315for (
unsignedI = 0,
E = Slice.size();
I <
E;
I += VF) {
7318if (getTreeEntry(SubSlice.
front()))
7320// Check if the subslice is to be-vectorized entry, which is not 7322if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7325 VectorizableTree[std::get<0>(
P)]
7330unsigned Sz = VectorizableTree.size();
7331 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7332if (Sz == VectorizableTree.size()) {
7333 IsVectorized =
false;
7334// Try non-interleaved vectorization with smaller vector 7336if (InterleaveFactor > 0) {
7337 VF = 2 * (MaxVF / InterleaveFactor);
7338 InterleaveFactor = 0;
7347 NonVectorized.
append(SortedNonVectorized);
7349return NonVectorized;
7351for (
constauto &GLs : GatheredLoads) {
7352constauto &
Ref = GLs.second;
7354if (!
Ref.empty() && !NonVectorized.
empty() &&
7356Ref.begin(),
Ref.end(), 0u,
7358ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7359 return S + LoadsDists.size();
7360 }) != NonVectorized.
size() &&
7361 IsMaskedGatherSupported(NonVectorized)) {
7363for (
LoadInst *LI : NonVectorized) {
7364// Reinsert non-vectorized loads to other list of loads with the same 7370// Final attempt to vectorize non-vectorized loads. 7371 (void)ProcessGatheredLoads(FinalGatheredLoads,
/*Final=*/true);
  // Try to vectorize postponed load entries, previously marked as gathered.
  for (unsigned Idx : LoadEntriesToVectorize) {
7376const TreeEntry &
E = *VectorizableTree[
Idx];
7378// Avoid reordering, if possible. 7379if (!
E.ReorderIndices.empty()) {
      // Build a mask out of the reorder indices and reorder scalars per this
      // mask.
      buildTree_rec(GatheredScalars, 0, EdgeInfo());
  // If no new entries created, consider it as no gathered loads entries must
  // be handled.
  if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7391 VectorizableTree.size())
7392 GatheredLoadsEntriesFirst.reset();
/// \return true if the specified list of values has only one instruction that
/// requires scheduling, false otherwise.
  Value *NeedsScheduling = nullptr;
7400for (
Value *V : VL) {
7403if (!NeedsScheduling) {
7404 NeedsScheduling = V;
7409return NeedsScheduling;
/// Generates key/subkey pair for the given value to provide effective sorting
/// of the values and better detection of the vectorizable values sequences.
/// The keys/subkeys can be used for better sorting of the values themselves
/// (keys) and in values subgroups (subkeys).
                                       bool AllowAlternate) {
  // Sort the loads by the distance between the pointers.
  if (auto *LI = dyn_cast<LoadInst>(V)) {
7427 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
  // Sort extracts by the vector operands.
  if (isa<ExtractElementInst, UndefValue>(V))
7434if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7436 !isa<UndefValue>(EI->getIndexOperand()))
7439 }
elseif (
auto *
I = dyn_cast<Instruction>(V)) {
    // Sort other instructions just by the opcodes except for CMPInst.
    // For CMP also sort by the predicate kind.
    if ((isa<BinaryOperator, CastInst>(I)) &&
7452 : cast<CastInst>(
I)->getOperand(0)->getType()));
7453// For casts, look through the only operand to improve compile time. 7454if (isa<CastInst>(
I)) {
7455 std::pair<size_t, size_t> OpVals =
7457/*AllowAlternate=*/true);
7461 }
elseif (
auto *CI = dyn_cast<CmpInst>(
I)) {
7463if (CI->isCommutative())
7469 }
elseif (
auto *Call = dyn_cast<CallInst>(
I)) {
7483 }
elseif (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7484if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7485 SubKey =
hash_value(Gep->getPointerOperand());
7489 !isa<ConstantInt>(
I->getOperand(1))) {
      // Do not try to vectorize instructions with potentially high cost.
  return std::make_pair(Key, SubKey);
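// Illustrative note (not from the original source): the (Key, SubKey) pair is
// a coarse-to-fine grouping. For instance, all loads may share one Key while
// their SubKey encodes the load-specific subkey above, and casts share a Key
// while the SubKey folds in the source operand type, so that values likely to
// form one vectorizable bundle sort next to each other.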
7500/// Checks if the specified instruction \p I is an alternate operation for 7501/// the given \p MainOp and \p AltOp instructions. 7507bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7509unsigned Opcode0 = S.getOpcode();
7510unsigned Opcode1 = S.getAltOpcode();
7512// If this pattern is supported by the target then consider it profitable. 7514 Opcode0, Opcode1, OpcodeMask))
7517for (
unsignedI : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7519// Prepare the operand vector. 7520for (
Value *V : VL) {
7521if (isa<PoisonValue>(V)) {
7526Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7530// Try find best operands candidates. 7531for (
unsignedI : seq<unsigned>(0, VL.size() - 1)) {
7537switch (Res.value_or(0)) {
  constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
  unsigned NonInstCnt = 0;
  // Estimate number of instructions, required for the vectorized node and for
  // the buildvector node.
  unsigned UndefCnt = 0;
  // Count the number of extra shuffles, required for vector nodes.
  unsigned ExtraShuffleInsts = 0;
  // Check that operands do not contain same values and create either perfect
  // diamond match or shuffled match.
  // Do not count same operands twice.
        return is_contained(Operands.back(), V);
7570 ++ExtraShuffleInsts;
7574// Vectorize node, if: 7575// 1. at least single operand is constant or splat. 7576// 2. Operands have many loop invariants (the instructions are not loop 7578// 3. At least single unique operands is supposed to vectorized. 7587if (isa<Constant, ExtractElementInst>(V) ||
7588 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7589 if (isa<UndefValue>(V))
7594// Found first duplicate - need to add shuffle. 7595if (!Res.second && Res.first->second == 1)
7596 ++ExtraShuffleInsts;
7597 ++Res.first->getSecond();
7598if (
auto *
I = dyn_cast<Instruction>(V))
7599 UniqueOpcodes.
insert(
I->getOpcode());
7603returnnone_of(Uniques, [&](
constauto &
P) {
7604returnP.first->hasNUsesOrMore(
P.second + 1) &&
7606 return getTreeEntry(U) || Uniques.contains(U);
         // Do not vectorize node, if estimated number of vector instructions
         // is more than estimated number of buildvector instructions. Number
         // of vector operands is number of vector instructions + number of
         // vector instructions for operands (buildvectors). Number of
         // buildvector instructions is just number_of_operands *
         // number_of_scalars.
         (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
          (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
           NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
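// Worked example (illustrative, not from the original source): for an
// alternate add/sub node with 4 scalars and 2 operands each, the buildvector
// side is estimated as 2 * 4 = 8 instructions, while the vector side is
// roughly UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts + 3
// (main + alt + shuffle). With 2 unique operand opcodes and no extra shuffles
// that is 2 + 0 + 0 + 3 = 5 < 8, so the alternate node is kept.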
7620BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7622bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7625"Expected instructions with same/alternate opcodes only.");
7627unsigned ShuffleOrOp =
7628 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7630switch (ShuffleOrOp) {
7631case Instruction::PHI: {
    // Too many operands - gather, most probably won't be vectorized.
      return TreeEntry::NeedToGather;
    // Check for terminator values (e.g. invoke).
    for (Value *V : VL) {
7637auto *
PHI = dyn_cast<PHINode>(V);
7642if (Term &&
Term->isTerminator()) {
7644 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7645return TreeEntry::NeedToGather;
7650return TreeEntry::Vectorize;
7652case Instruction::ExtractValue:
7653case Instruction::ExtractElement: {
7654bool Reuse = canReuseExtract(VL, CurrentOrder);
    // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
    // non-full registers).
      return TreeEntry::NeedToGather;
7659if (Reuse || !CurrentOrder.empty())
7660return TreeEntry::Vectorize;
7662return TreeEntry::NeedToGather;
7664case Instruction::InsertElement: {
7665// Check that we have a buildvector and not a shuffle of 2 or more 7666// different vectors. 7668for (
Value *V : VL) {
7669 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7671"Non-constant or undef index?");
7675return !SourceVectors.contains(V);
7677// Found 2nd source vector - cancel. 7679"different source vectors.\n");
7680return TreeEntry::NeedToGather;
7684// The last InsertElement can have multiple uses. 7685return SourceVectors.contains(V) && !
V->hasOneUse();
7690return TreeEntry::NeedToGather;
7693return TreeEntry::Vectorize;
7695case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
      return TreeEntry::Vectorize;
      if (!IsGraphTransformMode && !VectorizableTree.empty()) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
7709return TreeEntry::NeedToGather;
7711return TreeEntry::ScatterVectorize;
      if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
        // Delay slow vectorized nodes for better vectorization attempts.
        LoadEntriesToVectorize.insert(VectorizableTree.size());
7716return TreeEntry::NeedToGather;
7718return TreeEntry::StridedVectorize;
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy))
      LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7726auto *LI = dyn_cast<LoadInst>(V);
7727return !LI || !LI->isSimple();
7734return TreeEntry::NeedToGather;
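    // Illustrative note (not from the original source): the non-packed-type
    // check above rejects element types whose allocation is padded. E.g., i2
    // has a type size of 2 bits but an alloc size of 8 bits, so a vectorized
    // <N x i2> access would touch different memory than N scalar i2 accesses;
    // i32 (32 == 32 bits) passes the check.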
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
7751for (
Value *V : VL) {
7752if (isa<PoisonValue>(V))
7754Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7757dbgs() <<
"SLP: Gathering casts with different src types.\n");
7758return TreeEntry::NeedToGather;
7761return TreeEntry::Vectorize;
7763case Instruction::ICmp:
7764case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    for (Value *V : VL) {
7770if (isa<PoisonValue>(V))
      auto *Cmp = cast<CmpInst>(V);
      if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
          Cmp->getOperand(0)->getType() != ComparedTy) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
        return TreeEntry::NeedToGather;
7779return TreeEntry::Vectorize;
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze:
7802if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && I->isBinaryOp() && !I->isFast();
7807return TreeEntry::NeedToGather;
7808return TreeEntry::Vectorize;
7809case Instruction::GetElementPtr: {
    // We don't combine GEPs with complicated (nested) indexing.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (I->getNumOperands() != 2) {
        LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
        return TreeEntry::NeedToGather;
7821// We can't combine several GEPs into one vector if they operate on 7823Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7824for (
Value *V : VL) {
7825auto *
GEP = dyn_cast<GEPOperator>(V);
7828Type *CurTy =
GEP->getSourceElementType();
7830LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7831return TreeEntry::NeedToGather;
7835// We don't combine GEPs with non-constant indexes. 7837for (
Value *V : VL) {
7838auto *
I = dyn_cast<GetElementPtrInst>(V);
7841auto *
Op =
I->getOperand(1);
7842if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7843 (
Op->getType() != Ty1 &&
7844 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7845Op->getType()->getScalarSizeInBits() >
7846DL->getIndexSizeInBits(
7847V->getType()->getPointerAddressSpace())))) {
7849dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7850return TreeEntry::NeedToGather;
7854return TreeEntry::Vectorize;
7856case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        return TreeEntry::NeedToGather;
    // Check the order of pointer operands.
    if (CurrentOrder.empty()) {
      Ptr0 = PointerOps.front();
      PtrN = PointerOps.back();
    } else {
      Ptr0 = PointerOps[CurrentOrder.front()];
      PtrN = PointerOps[CurrentOrder.back()];
    }
    std::optional<int> Dist =
        getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
    // Check that the sorted pointer operands are consecutive.
    if (static_cast<unsigned>(*Dist) == VL.size() - 1)
      return TreeEntry::Vectorize;

    return TreeEntry::NeedToGather;
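    // Worked example (illustrative, not from the original source): for four
    // stores to p, p+1, p+2, p+3 (in units of the stored type) the sorted
    // first/last pointers differ by 3 == VL.size() - 1, so the bundle is
    // vectorized as one consecutive store; a gap (p, p+1, p+3, p+4) gives a
    // distance of 4 != 3 and the bundle is gathered instead.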
7898case Instruction::Call: {
7899if (S.getMainOp()->getType()->isFloatingPointTy() &&
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
7904return TreeEntry::NeedToGather;
7905// Check if the calls are all to the same vectorizable intrinsic or 7913false/*HasGlobalPred*/);
7918return TreeEntry::NeedToGather;
7923for (
unsigned J = 0; J != NumArgs; ++J)
7926for (
Value *V : VL) {
7927CallInst *CI2 = dyn_cast<CallInst>(V);
7935return TreeEntry::NeedToGather;
7937// Some intrinsics have scalar arguments and should be same in order for 7938// them to be vectorized. 7939for (
unsigned J = 0; J != NumArgs; ++J) {
7942if (ScalarArgs[J] != A1J) {
7944 <<
"SLP: mismatched arguments in call:" << *CI
7945 <<
" argument " << ScalarArgs[J] <<
"!=" << A1J <<
"\n");
7946return TreeEntry::NeedToGather;
7950// Verify that the bundle operands are identical between the two calls. 7955LLVM_DEBUG(
dbgs() <<
"SLP: mismatched bundle operands in calls:" << *CI
7956 <<
"!=" << *V <<
'\n');
7957return TreeEntry::NeedToGather;
7961return TreeEntry::Vectorize;
7963case Instruction::ShuffleVector: {
7964if (!S.isAltShuffle()) {
7965// REVEC can support non alternate shuffle. 7967return TreeEntry::Vectorize;
7968// If this is not an alternate sequence of opcode like add-sub 7969// then do not vectorize this instruction. 7971return TreeEntry::NeedToGather;
7976 <<
"SLP: ShuffleVector not vectorized, operands are buildvector and " 7977"the whole alt sequence is not profitable.\n");
7978return TreeEntry::NeedToGather;
7981return TreeEntry::Vectorize;
7985return TreeEntry::NeedToGather;
7990/// Allows to correctly handle operands of the phi nodes based on the \p Main 7991/// PHINode order of incoming basic blocks/values. 7999 PHIHandler() =
delete;
8001 : DT(DT), Main(Main), Phis(Phis),
8002Operands(Main->getNumIncomingValues(),
8004void buildOperands() {
8005constexprunsigned FastLimit = 4;
8013// Prepare the operand vector. 8015auto *
P = dyn_cast<PHINode>(V);
8017assert(isa<PoisonValue>(V) &&
8018"Expected isa instruction or poison value.");
8022if (
P->getIncomingBlock(
I) == InBB)
8037Blocks.try_emplace(InBB).first->second.push_back(
I);
8040if (isa<PoisonValue>(V)) {
8045auto *
P = cast<PHINode>(V);
8046for (
unsignedI : seq<unsigned>(0,
P->getNumIncomingValues())) {
8054auto It =
Blocks.find(InBB);
8061if (
P.getSecond().size() <= 1)
8063unsigned BasicI =
P.getSecond().front();
8066 [&](
constauto &Data) {
8067return !Data.value() ||
8068 Data.value() ==
Operands[BasicI][Data.index()];
8070"Expected empty operands list.");
8080const EdgeInfo &UserTreeIdx,
8081unsigned InterleaveFactor) {
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    for (Value *V : VL) {
8098auto Res = UniquePositions.try_emplace(V, UniqueValues.
size());
8103size_t NumUniqueScalarValues = UniqueValues.
size();
8106if (NumUniqueScalarValues == VL.size() &&
8108 ReuseShuffleIndices.
clear();
8110// FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 8111if ((UserTreeIdx.UserTE &&
8112 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI)) ||
8115"for nodes with padding.\n");
8116 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8120if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8121 (UniquePositions.size() == 1 &&
all_of(UniqueValues, [](
Value *V) {
8124if (DoNotFail && UniquePositions.size() > 1 &&
8125 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8126all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8127// Find the number of elements, which forms full vectors. 8129 *
TTI, UniqueValues.
front()->getType(), UniqueValues.
size());
8130if (PWSz == VL.size()) {
8131 ReuseShuffleIndices.
clear();
8133 NonUniqueValueVL.
assign(UniqueValues.
begin(), UniqueValues.
end());
8135 PWSz - UniqueValues.
size(),
8137// Check that extended with poisons operations are still valid for 8138// vectorization (div/rem are not allowed). 8141 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8144 VL = NonUniqueValueVL;
8149 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8164 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
  // Check if this is a duplicate of another entry.
  if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8173if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8174auto It = MultiNodeScalars.
find(S.getMainOp());
8175if (It != MultiNodeScalars.
end()) {
8176auto *TEIt =
find_if(It->getSecond(),
8177 [&](TreeEntry *ME) { return ME->isSame(VL); });
8178if (TEIt != It->getSecond().end())
8189if (TryToFindDuplicates(S))
8190 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8191 ReuseShuffleIndices);
8195 Nodes.
insert(getTreeEntry(S.getMainOp()));
8196for (
const TreeEntry *E : MultiNodeScalars.
lookup(S.getMainOp()))
8199if (
any_of(Nodes, [&](
const TreeEntry *E) {
8201 [&](
Value *V) { return Values.contains(V); }))
8206all_of(VL, [&](
Value *V) {
return EValues.contains(V); }));
8209if (TryToFindDuplicates(S))
8210 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8211 ReuseShuffleIndices);
8215// Record the reuse of the tree node. FIXME, currently this is only 8216// used to properly draw the graph rather than for the actual 8218 E->UserTreeIndices.push_back(UserTreeIdx);
8219LLVM_DEBUG(
dbgs() <<
"SLP: Perfect diamond merge at " << *S.getMainOp()
8226// Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of 8227// a load), in which case peek through to include it in the tree, without 8228// ballooning over-budget. 8230 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8235 cast<Instruction>(
I)->getOpcode() == S.getOpcode();
8237LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to max recursion depth.\n");
8238if (TryToFindDuplicates(S))
8239 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8240 ReuseShuffleIndices);
  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8248LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to scalable vector type.\n");
8249if (TryToFindDuplicates(S))
8250 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8251 ReuseShuffleIndices);
8255// Don't handle vectors. 8258 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8262// If all of the operands are identical or constant we have a simple solution. 8263// If we deal with insert/extract instructions, they all must have constant 8264// indices, otherwise we should gather them, not try to vectorize. 8265// If alternate op node with 2 elements with gathered operands - do not 8267auto &&NotProfitableForVectorization = [&S,
this,
8269if (!S || !S.isAltShuffle() || VL.size() > 2)
8275// Check if all operands are extracts, part of vector node or can build a 8276// regular vectorize node. 8278for (
Value *V : VL) {
8279auto *
I = cast<Instruction>(V);
8281 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8286if ((IsCommutative &&
8287 std::accumulate(InstsCount.
begin(), InstsCount.
end(), 0) < 2) ||
8289all_of(InstsCount, [](
unsigned ICnt) {
return ICnt < 2; })))
8291assert(VL.size() == 2 &&
"Expected only 2 alternate op instructions.");
8293auto *
I1 = cast<Instruction>(VL.front());
8294auto *I2 = cast<Instruction>(VL.back());
8295for (
intOp : seq<int>(S.getMainOp()->getNumOperands()))
8297 I2->getOperand(
Op));
8299 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8301 })) >= S.getMainOp()->getNumOperands() / 2)
8303if (S.getMainOp()->getNumOperands() > 2)
8306// Check permuted operands. 8310 I2->getOperand((
Op + 1) % E));
8312 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
8321bool IsScatterVectorizeUserTE =
8322 UserTreeIdx.UserTE &&
8323 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8325bool AreScatterAllGEPSameBlock =
8326 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8330auto *
I = dyn_cast<GetElementPtrInst>(V);
8335return BB ==
I->getParent() &&
I->getNumOperands() == 2;
8338sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8340bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8343 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8346 NotProfitableForVectorization(VL)) {
8347LLVM_DEBUG(
dbgs() <<
"SLP: Gathering due to C,S,B,O, small shuffle. \n");
8348if (TryToFindDuplicates(S))
8349 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8350 ReuseShuffleIndices);
8354// Don't vectorize ephemeral values. 8355if (S && !EphValues.
empty()) {
8356for (
Value *V : VL) {
8357if (EphValues.
count(V)) {
8359 <<
") is ephemeral.\n");
8360 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx);
8366// We now know that this is a vector of instructions of the same type from 8369// Check that none of the instructions in the bundle are already in the tree. 8370for (
Value *V : VL) {
8371if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8374if (getTreeEntry(V)) {
8376 <<
") is already in tree.\n");
8377if (TryToFindDuplicates(S))
8378 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8379 ReuseShuffleIndices);
8384// The reduction nodes (stored in UserIgnoreList) also should stay scalar. 8385if (UserIgnoreList && !UserIgnoreList->empty()) {
8386for (
Value *V : VL) {
8387if (UserIgnoreList->contains(V)) {
8389if (TryToFindDuplicates(S))
8390 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8391 ReuseShuffleIndices);
  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8400assert(VL.front()->getType()->isPointerTy() &&
8401count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8402"Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8405assert(It != VL.end() &&
"Expected at least one GEP.");
  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  // Don't go into unreachable blocks. They may contain instructions with
  // dependency cycles which confuse the final scheduling.
  // Do not vectorize EH and non-returning blocks, not profitable in most
  // cases.
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
  // Perform specific checks for each particular instruction kind.
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
8436 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8437 ReuseShuffleIndices);
8441auto &BSRef = BlocksSchedules[BB];
8443 BSRef = std::make_unique<BlockScheduling>(BB);
8445 BlockScheduling &BS = *BSRef;
8447 std::optional<ScheduleData *> Bundle =
8448 BS.tryScheduleBundle(UniqueValues,
this, S);
8449#ifdef EXPENSIVE_CHECKS 8450// Make sure we didn't break any internal invariants 8454LLVM_DEBUG(
dbgs() <<
"SLP: We are not able to schedule this bundle!\n");
8455assert((!BS.getScheduleData(VL0) ||
8456 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8457"tryScheduleBundle should cancelScheduling on failure");
8458 newTreeEntry(VL, std::nullopt
/*not vectorized*/, S, UserTreeIdx,
8459 ReuseShuffleIndices);
8460 NonScheduledFirst.insert(VL.front());
8461if (S.getOpcode() == Instruction::Load &&
8462 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8466LLVM_DEBUG(
dbgs() <<
"SLP: We are able to schedule this bundle.\n");
8468unsigned ShuffleOrOp =
8469 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
8470auto CreateOperandNodes = [&](TreeEntry *
TE,
constauto &
Operands) {
8471// Postpone PHI nodes creation 8473for (
unsignedI : seq<unsigned>(
Operands.size())) {
8478if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8483for (
unsignedI : PHIOps)
8486switch (ShuffleOrOp) {
8487case Instruction::PHI: {
8488auto *PH = cast<PHINode>(VL0);
8491 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8495// Keeps the reordered operands to avoid code duplication. 8496 PHIHandler Handler(*DT, PH, VL);
8497 Handler.buildOperands();
8498for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8499TE->setOperand(
I, Handler.getOperands(
I));
8501for (
unsignedI : seq<unsigned>(PH->getNumOperands()))
8506case Instruction::ExtractValue:
8507case Instruction::ExtractElement: {
8508if (CurrentOrder.empty()) {
8509LLVM_DEBUG(
dbgs() <<
"SLP: Reusing or shuffling extract sequence.\n");
8512dbgs() <<
"SLP: Reusing or shuffling of reordered extract sequence " 8514for (
unsignedIdx : CurrentOrder)
8520// Insert new order with initial value 0, if it does not exist, 8521// otherwise return the iterator to the existing one. 8522 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8523 ReuseShuffleIndices, CurrentOrder);
8525"(ExtractValueInst/ExtractElementInst).\n";
8527// This is a special case, as it does not gather, but at the same time 8528// we are not extending buildTree_rec() towards the operands. 8529TE->setOperand(*
this);
8532case Instruction::InsertElement: {
      assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
      auto OrdCompare = [](const std::pair<int, int> &P1,
                           const std::pair<int, int> &P2) {
        return P1.first > P2.first;
      };
8540decltype(OrdCompare)>
8541 Indices(OrdCompare);
8542for (
intI = 0, E = VL.size();
I < E; ++
I) {
8544 Indices.emplace(
Idx,
I);
      OrdersType CurrentOrder(VL.size(), VL.size());
      bool IsIdentity = true;
      for (int I = 0, E = VL.size(); I < E; ++I) {
        CurrentOrder[Indices.top().second] = I;
        IsIdentity &= Indices.top().second == I;
        Indices.pop();
      }
      if (IsIdentity)
        CurrentOrder.clear();
8555 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8557LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (InsertElementInst).\n";
8560TE->setOperand(*
this);
8561 buildTree_rec(
TE->getOperand(1),
Depth + 1, {TE, 1});
8564case Instruction::Load: {
      // Check that a vectorized load would load the same memory as a scalar
      // load. For example, we don't want to vectorize loads that are smaller
      // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}
      // LLVM treats loading/storing it as an i8 struct. If we vectorize
      // loads/stores from such a struct, we read/write packed bits disagreeing
      // with the unvectorized version.
      TreeEntry *TE = nullptr;
8574case TreeEntry::Vectorize:
8575TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8576 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8577if (CurrentOrder.empty())
8582 <<
"SLP: added a new TreeEntry (jumbled LoadInst).\n";
      case TreeEntry::StridedVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8589LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (strided LoadInst).\n";
      case TreeEntry::ScatterVectorize:
        // Vectorizing non-consecutive loads with `llvm.masked.gather`.
        TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                          UserTreeIdx, ReuseShuffleIndices);
8598 <<
"SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8601case TreeEntry::CombinedVectorize:
8602case TreeEntry::NeedToGather:
8605TE->setOperand(*
this);
8606if (State == TreeEntry::ScatterVectorize)
8607 buildTree_rec(PointerOps,
Depth + 1, {
TE, 0});
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::SIToFP:
    case Instruction::UIToFP:
    case Instruction::Trunc:
    case Instruction::FPTrunc:
    case Instruction::BitCast: {
8622auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8623 std::make_pair(std::numeric_limits<unsigned>::min(),
8624 std::numeric_limits<unsigned>::max()));
8625if (ShuffleOrOp == Instruction::ZExt ||
8626 ShuffleOrOp == Instruction::SExt) {
8627 CastMaxMinBWSizes = std::make_pair(
8633 }
elseif (ShuffleOrOp == Instruction::Trunc) {
8634 CastMaxMinBWSizes = std::make_pair(
8641 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8642 ReuseShuffleIndices);
8646TE->setOperand(*
this);
8648 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8649if (ShuffleOrOp == Instruction::Trunc) {
8650 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8651 }
elseif (ShuffleOrOp == Instruction::SIToFP ||
8652 ShuffleOrOp == Instruction::UIToFP) {
8653unsigned NumSignBits =
8655if (
auto *OpI = dyn_cast<Instruction>(VL0->
getOperand(0))) {
8657 NumSignBits = std::max(NumSignBits,
Mask.countl_zero());
8659if (NumSignBits * 2 >=
8661 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8665case Instruction::ICmp:
8666case Instruction::FCmp: {
8667// Check that all of the compares have the same predicate. 8669 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8670 ReuseShuffleIndices);
8675 VLOperands Ops(VL, S, *
this);
8677// Commutative predicate - collect + sort operands of the instructions 8678// so that each side is more likely to have the same opcode. 8680"Commutative Predicate mismatch");
8683Right = Ops.getVL(1);
8685// Collect operands - commute if it uses the swapped predicate. 8686for (
Value *V : VL) {
8687if (isa<PoisonValue>(V)) {
8692auto *
Cmp = cast<CmpInst>(V);
8695if (
Cmp->getPredicate() != P0)
8698Right.push_back(RHS);
8705if (ShuffleOrOp == Instruction::ICmp) {
8706unsigned NumSignBits0 =
8708if (NumSignBits0 * 2 >=
8710 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 0)->
Idx);
8711unsigned NumSignBits1 =
8713if (NumSignBits1 * 2 >=
8715 ExtraBitWidthNodes.
insert(getOperandEntry(TE, 1)->
Idx);
    case Instruction::Select:
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
    case Instruction::Freeze: {
8740 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8741 ReuseShuffleIndices);
8743dbgs() <<
"SLP: added a new TreeEntry " 8744"(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8749 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8752case Instruction::GetElementPtr: {
8753 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8754 ReuseShuffleIndices);
8755LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (GetElementPtrInst).\n";
8758// Prepare the operand vector for pointer operands. 8759for (
Value *V : VL) {
8760auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8765Operands.front().push_back(
GEP->getPointerOperand());
8768// Need to cast all indices to the same type before vectorization to 8770// Required to be able to find correct matches between different gather 8771// nodes and reuse the vectorized values rather than trying to gather them 8776 [VL0Ty, IndexIdx](
Value *V) {
8777auto *
GEP = dyn_cast<GetElementPtrInst>(V);
8780return VL0Ty ==
GEP->getOperand(IndexIdx)->getType();
8784 ->getPointerOperandType()
8786// Prepare the operand vector. 8787for (
Value *V : VL) {
8788auto *
I = dyn_cast<GetElementPtrInst>(V);
8791 ConstantInt::get(Ty, 0,
/*isSigned=*/false));
8794auto *
Op =
I->getOperand(IndexIdx);
8795auto *CI = dyn_cast<ConstantInt>(
Op);
8800 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8804for (
unsignedI = 0, Ops =
Operands.size();
I < Ops; ++
I)
8808case Instruction::Store: {
8809bool Consecutive = CurrentOrder.empty();
8812 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8813 ReuseShuffleIndices, CurrentOrder);
8819dbgs() <<
"SLP: added a new TreeEntry (jumbled StoreInst).\n";
8821TE->setOperand(*
this);
8822 buildTree_rec(
TE->getOperand(0),
Depth + 1, {TE, 0});
8825case Instruction::Call: {
8826// Check if the calls are all to the same vectorizable intrinsic or 8831 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8832 ReuseShuffleIndices);
8836for (
unsignedI : seq<unsigned>(CI->
arg_size())) {
8837// For scalar operands no need to create an entry since no need to 8841 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
8845case Instruction::ShuffleVector: {
8846 TreeEntry *
TE = newTreeEntry(VL, Bundle
/*vectorized*/, S, UserTreeIdx,
8847 ReuseShuffleIndices);
8848if (S.isAltShuffle()) {
8849LLVM_DEBUG(
dbgs() <<
"SLP: added a new TreeEntry (isAltShuffle).\n";
8854dbgs() <<
"SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8858// Reorder operands if reordering would enable vectorization. 8859auto *CI = dyn_cast<CmpInst>(VL0);
8861return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8863auto *MainCI = cast<CmpInst>(S.getMainOp());
8864auto *AltCI = cast<CmpInst>(S.getAltOp());
8868"Expected different main/alternate predicates.");
8870// Collect operands - commute if it uses the swapped predicate or 8871// alternate operation. 8872for (
Value *V : VL) {
8873if (isa<PoisonValue>(V)) {
8878auto *
Cmp = cast<CmpInst>(V);
8890Right.push_back(RHS);
8899TE->setOperand(*
this, isa<BinaryOperator>(VL0) || CI);
8901 buildTree_rec(
TE->getOperand(
I),
Depth + 1, {TE, I});
  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }
8937if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8945bool ResizeAllowed)
const{
8946constauto *It =
find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8947assert(It != VL.
end() &&
"Expected at least one extract instruction.");
8948auto *E0 = cast<Instruction>(*It);
8950all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8952// Check if all of the extracts come from the same vector and from the 8954Value *Vec = E0->getOperand(0);
8956 CurrentOrder.
clear();
8958// We have to extract from a vector/aggregate with the same number of elements. 8960if (E0->getOpcode() == Instruction::ExtractValue) {
8964// Check if load can be rewritten as load of vector. 8965LoadInst *LI = dyn_cast<LoadInst>(Vec);
8969 NElts = cast<FixedVectorType>(Vec->
getType())->getNumElements();
8972unsigned E = VL.
size();
8973if (!ResizeAllowed && NElts != E)
8976unsigned MinIdx = NElts, MaxIdx = 0;
8978auto *Inst = dyn_cast<Instruction>(V);
8981if (Inst->getOperand(0) != Vec)
8983if (
auto *EE = dyn_cast<ExtractElementInst>(Inst))
8984if (isa<UndefValue>(EE->getIndexOperand()))
8989constunsigned ExtIdx = *
Idx;
8992 Indices[
I] = ExtIdx;
8998if (MaxIdx - MinIdx + 1 > E)
  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E + 1 so we can check if the extract
  // instruction index was used already.
  // Also, later we can check that all the indices are used and we have a
  // consecutive access in the extract instructions, by checking that no
  // element of CurrentOrder still has value E + 1.
  CurrentOrder.assign(E, E);
9011for (
unsignedI = 0;
I < E; ++
I) {
9014constunsigned ExtIdx = Indices[
I] - MinIdx;
9015if (CurrentOrder[ExtIdx] != E) {
9016 CurrentOrder.
clear();
9019 ShouldKeepOrder &= ExtIdx ==
I;
9020 CurrentOrder[ExtIdx] =
I;
9023 CurrentOrder.
clear();
9025return ShouldKeepOrder;
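// Worked example (illustrative, not from the original source): extracts of
// indices {1, 0, 3, 2} from a single 4-element vector are accepted with
// CurrentOrder == {1, 0, 3, 2} and ShouldKeepOrder == false (a shuffle is
// still required), while indices {0, 1, 2, 3} yield ShouldKeepOrder == true,
// i.e. the extracts already appear in vector order.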
bool BoUpSLP::areAllUsersVectorized(
  return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
static std::pair<InstructionCost, InstructionCost>
  // Calculate the cost of the scalar and vector calls.
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
                          false /*HasGlobalPred*/);
  auto LibCost = IntrinsicCost;
  // Calculate the cost of the vector library call.
  // If the corresponding vector call is cheaper, return its cost.
  return {IntrinsicCost, LibCost};
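  // For illustration (hypothetical example, exact costs are target dependent):
  // for a candidate group of four scalar calls to llvm.sqrt.f32, the cost of
  // the vector intrinsic llvm.sqrt.v4f32 is compared against a vector-library
  // implementation of the same call (when the target library info provides
  // one), and the cheaper alternative is reported to the caller.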
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
  unsigned Sz = Scalars.size();
  if (!ReorderIndices.empty())
  for (unsigned I = 0; I < Sz; ++I) {
    if (!ReorderIndices.empty())
    if (isa<PoisonValue>(Scalars[Idx]))
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
  if (!ReuseShuffleIndices.empty()) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
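  // Illustrative example: for an alternate add/sub node over scalars
  // [add, sub, add, sub], the resulting mask is roughly <0, Sz+1, 2, Sz+3>
  // (i.e. <0, 5, 2, 7> for Sz == 4), picking the lanes of the "main" opcode
  // vector and the "alternate" opcode vector for the final shufflevector.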
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
    return MainP != P && MainP != SwappedP;
  const auto *Op0 = Ops.front();
  // TODO: We should allow undef elements here
  // TODO: We should allow undef elements here
  // TODO: We should allow undef elements here
  if (auto *CI = dyn_cast<ConstantInt>(V))
    return CI->getValue().isPowerOf2();
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
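  // For illustration (hypothetical values): an operand list such as
  // <8, 8, 8, 8> is constant, uniform and a power-of-two splat, while
  // <-4, -4, -4, -4> satisfies the negated-power-of-two check; such operand
  // kinds may allow TTI to report cheaper vector instruction costs.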
  if (IsConstant && IsUniform)
/// The base class for shuffle instruction emission and shuffle cost estimation.
class BaseShuffleAnalysis {
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value.
  /// When REVEC is disabled, there is no difference between VF and
  /// VNumElements.
  /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
  /// e.g., if ScalarTy is <4 x Ty> and V1 is <8 x Ty>, 2 is returned instead
  /// of 8.
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
    int Limit = Mask.size();
    // Consider extract subvector starting from index 0.
    // All VF-size submasks are identity (e.g.
    // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
    if (Limit % VF == 0 &&
        all_of(seq<int>(0, Limit / VF), [=](int Idx) {

  /// Tries to combine 2 different masks into single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
    unsigned VF = Mask.size();
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      int MaskedIdx = Mask[ExtMask[I] % VF];
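      // For illustration, with LocalVF == 4: combining an inner mask
      // Mask == <1, 0, 3, 2> with an outer ExtMask == <3, 2, 1, 0> yields
      // <Mask[3], Mask[2], Mask[1], Mask[0]> == <2, 3, 0, 1>, so a single
      // shuffle with the combined mask suffices (illustrative values).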
  /// Looks through shuffles trying to reduce final number of shuffles in the
  /// code. The function looks through the previously emitted shuffle
  /// instructions and properly marks indices in the mask as undef.
  /// For example, given the code
  ///
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
  ///
  /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// If 2 operands are of different size, the smallest one will be resized and
  /// the mask recalculated properly.
  /// For example, given the code
  ///
  /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
  /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
  ///
  /// and if we need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it
  /// will look through %s1 and %s2 and select vectors %0 and %1 with mask
  /// <0, 1, 2, 3> for the shuffle.
  /// So, it tries to transform permutations to simple vector merge, if
  /// possible.
  /// \param V The input vector which must be shuffled using the given \p Mask.
  /// If the better candidate is found, \p V is set to this best candidate
  /// vector.
  /// \param Mask The input mask for the shuffle. If the best candidate is found
  /// during looking-through-shuffles attempt, it is updated accordingly.
  /// \param SinglePermute true if the shuffle operation is originally a
  /// single-value-permutation. In this case the look-through-shuffles procedure
  /// may look for resizing shuffles as the best candidates.
  /// \return true if the shuffle results in the non-resizing identity shuffle
  /// (and thus can be ignored), false - otherwise.
                                  bool SinglePermute) {
9278while (
auto *SV = dyn_cast<ShuffleVectorInst>(
Op)) {
9279// Exit if not a fixed vector type or changing size shuffle. 9280auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9283// Remember the identity or broadcast mask, if it is not a resizing 9284// shuffle. If no better candidates are found, this Op and Mask will be 9285// used in the final shuffle. 9286if (isIdentityMask(Mask, SVTy,
/*IsStrict=*/false)) {
9287if (!IdentityOp || !SinglePermute ||
9288 (isIdentityMask(Mask, SVTy,
/*IsStrict=*/true) &&
9290 IdentityMask.
size()))) {
9292// Store current mask in the IdentityMask so later we did not lost 9293// this info if IdentityOp is selected as the best candidate for the 9295 IdentityMask.
assign(Mask);
      // Remember the broadcast mask. If no better candidates are found, this Op
      // and Mask will be used in the final shuffle.
      // Zero splat can be used as identity too, since it might be used with
      // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
      // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
      // is expensive, and the analysis finds out that the source vector is just
      // a broadcast, this original mask can be transformed to identity mask
      // <0, 1, 2, 3>.
      // E.g.
      // %0 = shuffle %v, poison, zeroinitializer
      // %res = shuffle %0, poison, <3, 1, 2, 0>
      // may be transformed to
      // %0 = shuffle %v, poison, zeroinitializer
      // %res = shuffle %0, poison, <0, 1, 2, 3>
      if (SV->isZeroEltSplat()) {
9317 IdentityMask.
assign(Mask);
9319int LocalVF =
Mask.size();
9321 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9322 LocalVF = SVOpTy->getNumElements();
9326static_cast<unsigned>(
I) >= SV->getShuffleMask().size())
9328 ExtMask[
Idx] = SV->getMaskValue(
I);
9338if (!IsOp1Undef && !IsOp2Undef) {
9339// Update mask and mark undef elems. 9340for (
int &
I : Mask) {
9343if (SV->getMaskValue(
I % SV->getShuffleMask().size()) ==
9350 combineMasks(LocalVF, ShuffleMask, Mask);
9351Mask.swap(ShuffleMask);
9353Op = SV->getOperand(0);
9355Op = SV->getOperand(1);
9357if (
auto *OpTy = dyn_cast<FixedVectorType>(
Op->getType());
9358 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9363"Expected masks of same sizes.");
9364// Clear known poison elements. 9368Mask.swap(IdentityMask);
9369auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9370return SinglePermute &&
9371 (isIdentityMask(Mask, cast<FixedVectorType>(
V->getType()),
9372/*IsStrict=*/true) ||
9373 (Shuffle &&
Mask.size() == Shuffle->getShuffleMask().size() &&
9374 Shuffle->isZeroEltSplat() &&
9384 /// Smart shuffle instruction emission, walks through shuffles trees and 9385 /// tries to find the best matching vector for the actual shuffle 9387template <
typename T,
typename ShuffleBuilderTy>
9389 ShuffleBuilderTy &Builder,
Type *ScalarTy) {
9390assert(V1 &&
"Expected at least one vector value.");
9393if (ScalarTyNumElements != 1) {
9399 Builder.resizeToMatch(V1, V2);
9401if (
auto *FTy = dyn_cast<FixedVectorType>(V1->
getType()))
9402 VF = FTy->getNumElements();
9403if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9406// Peek through shuffles. 9410 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
9413for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9415 CombinedMask1[
I] =
Mask[
I];
9417 CombinedMask2[
I] =
Mask[
I] - VF;
9424 (void)peekThroughShuffles(Op1, CombinedMask1,
/*SinglePermute=*/false);
9425 (void)peekThroughShuffles(Op2, CombinedMask2,
/*SinglePermute=*/false);
9426// Check if we have 2 resizing shuffles - need to peek through operands 9428if (
auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9429if (
auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9434 ExtMask1[
Idx] = SV1->getMaskValue(
I);
9437 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9439 ExtMask1, UseMask::SecondArg);
9444 ExtMask2[
Idx] = SV2->getMaskValue(
I);
9447 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9449 ExtMask2, UseMask::SecondArg);
9450if (SV1->getOperand(0)->getType() ==
9451 SV2->getOperand(0)->getType() &&
9452 SV1->getOperand(0)->getType() != SV1->getType() &&
9455 Op1 = SV1->getOperand(0);
9456 Op2 = SV2->getOperand(0);
9458int LocalVF = ShuffleMask1.size();
9459if (
auto *FTy = dyn_cast<FixedVectorType>(Op1->
getType()))
9460 LocalVF = FTy->getNumElements();
9461 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9462 CombinedMask1.swap(ShuffleMask1);
9464 LocalVF = ShuffleMask2.size();
9465if (
auto *FTy = dyn_cast<FixedVectorType>(Op2->
getType()))
9466 LocalVF = FTy->getNumElements();
9467 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9468 CombinedMask2.swap(ShuffleMask2);
9471 }
while (PrevOp1 != Op1 || PrevOp2 != Op2);
9472 Builder.resizeToMatch(Op1, Op2);
9473 VF = std::max(cast<VectorType>(Op1->
getType())
9475 .getKnownMinValue(),
9476 cast<VectorType>(Op2->
getType())
9478 .getKnownMinValue());
9479for (
intI = 0,
E =
Mask.size();
I <
E; ++
I) {
9482"Expected undefined mask element");
9483 CombinedMask1[
I] = CombinedMask2[
I] + (Op1 == Op2 ? 0 : VF);
9489 isa<ShuffleVectorInst>(Op1) &&
9490 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9492return Builder.createIdentity(Op1);
9493return Builder.createShuffleVector(
9497if (isa<PoisonValue>(V1))
9498return Builder.createPoison(
9499 cast<VectorType>(V1->
getType())->getElementType(),
Mask.size());
9500bool IsIdentity = peekThroughShuffles(V1, NewMask,
/*SinglePermute=*/true);
9501assert(V1 &&
"Expected non-null value after looking through shuffles.");
9504return Builder.createShuffleVector(V1, NewMask);
9505return Builder.createIdentity(V1);
9508 /// Transforms mask \p CommonMask per given \p Mask to make proper set after 9509 /// shuffle emission. 9512for (
unsignedI : seq<unsigned>(CommonMask.
size()))
/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
static std::pair<InstructionCost, InstructionCost>
  // Here we differentiate two cases: (1) when Ptrs represent a regular
  // vectorization tree node (as they are pointer arguments of scattered
  // loads) or (2) when Ptrs are the arguments of loads or stores being
  // vectorized as plain wide unit-stride load/store since all the
  // loads/stores are known to be from/to adjacent locations.
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Case 2: estimate costs for pointer related costs when vectorizing to
    // a wide load/store.
    // Scalar cost is estimated as a set of pointers with known relationship
    // For vector code we will use BasePtr as argument for the wide load/store
    // but we also need to account all the instructions which are going to
    // stay in vectorized code due to uses outside of these scalar
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
    for (Value *V : Ptrs) {
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity assume Ptr to stay in vectorized code if it's not a
      // GEP instruction. We don't care since it's cost considered free.
      // TODO: We should check for any uses outside of vectorizable tree
      // rather than just single use.
      if (!Ptr || !Ptr->hasOneUse())
    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code then we don't have
      // any savings on that.
          TTI::PointersChainInfo::getKnownStride(),
    // Case 1: Ptrs are the arguments of loads that we are going to transform
    // into masked gather load intrinsic.
    // All the scalar GEPs will be removed as a result of vectorization.
    // For any external uses of some lanes extract element instructions will
    // be generated (which cost is estimated separately).
    auto *Ptr = dyn_cast<GetElementPtrInst>(V);
    return Ptr && !Ptr->hasAllConstantIndices();
        ? TTI::PointersChainInfo::getUnknownStride()
        : TTI::PointersChainInfo::getKnownStride();
  auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
  auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
  if (It != Ptrs.end())
    BaseGEP = cast<GEPOperator>(*It);
      BaseGEP->getPointerOperand(), Indices, VecTy,
  return std::make_pair(ScalarCost, VecCost);
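// For illustration (hypothetical IR): four GEPs feeding a consecutive
// unit-stride load are costed as a unit-stride pointer chain against the
// single BasePtr kept for the wide load (case 2 above), while GEPs feeding a
// masked gather are costed as a known/unknown-stride pointer chain on both the
// scalar and the vector sides (case 1 above).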
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  // Do not reorder nodes if it is small (just 2 elements), all-constant or all
  // instructions have same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9613if (
any_of(seq<unsigned>(
TE.Idx), [&](
unsignedIdx) {
9614 return VectorizableTree[Idx]->isSame(TE.Scalars);
9618auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
9623auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
9624if (LIt != LoadsMap.
end()) {
9628/*StrictCheck=*/true))
9638if (LIt->second.size() > 2) {
9640hash_value(LIt->second.back()->getPointerOperand());
9646 LoadsMap.
try_emplace(std::make_pair(Key,
Ptr)).first->second.push_back(LI);
9651bool IsOrdered =
true;
9652unsigned NumInstructions = 0;
9653// Try to "cluster" scalar instructions, to be able to build extra vectorized 9657if (
auto *Inst = dyn_cast<Instruction>(V);
9658 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9661/*AllowAlternate=*/false);
9664auto &Container = SortedValues[
Key];
9665if (IsOrdered && !KeyToIndex.
contains(V) &&
9666 !(isa<Constant, ExtractElementInst>(V) ||
9668 ((Container.contains(
Idx) &&
9669 KeyToIndex.
at(Container[
Idx].back()).back() !=
I - 1) ||
9670 (!Container.empty() && !Container.contains(
Idx) &&
9671 KeyToIndex.
at(Container.back().second.back()).back() !=
I - 1)))
9673auto &KTI = KeyToIndex[
V];
9675 Container[
Idx].push_back(V);
9680if (!IsOrdered && NumInstructions > 1) {
9682TE.ReorderIndices.resize(
TE.Scalars.size(),
TE.Scalars.size());
9683for (
constauto &
D : SortedValues) {
9684for (
constauto &
P :
D.second) {
9686for (
Value *V :
P.second) {
9689TE.ReorderIndices[Cnt +
K] =
Idx;
9690TE.Scalars[Cnt +
K] =
V;
9692 Sz += Indices.
size();
9693 Cnt += Indices.
size();
9695if (Sz > 1 && isa<Instruction>(
P.second.front())) {
9697 *
TTI,
TE.Scalars.front()->getType(), Sz);
9699for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9701 }
elseif (!
P.second.empty() &&
isConstant(
P.second.front())) {
9702for (
unsignedI : seq<unsigned>(Cnt - Sz, Cnt))
9708// Reuses always require shuffles, so consider it as profitable. 9709if (!
TE.ReuseShuffleIndices.empty() ||
TE.ReorderIndices.empty())
9711// Do simple cost estimation. 9714auto *ScalarTy =
TE.Scalars.front()->getType();
9716for (
auto [
Idx, Sz] : SubVectors) {
9720if (
auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9722// If ScalarTy is FixedVectorType, we should use CreateInsertVector instead 9723// of CreateInsertElement. 9725for (
unsignedI : seq<unsigned>(
TE.Scalars.size()))
9734int Sz =
TE.Scalars.size();
9736TE.ReorderIndices.end());
9737for (
unsignedI : seq<unsigned>(Sz)) {
9739if (isa<PoisonValue>(V)) {
9742 ReorderMask[
I] =
I +
TE.ReorderIndices.size();
9746any_of(ReorderMask, [&](
intI) {
returnI >= Sz; })
9749 VecTy, ReorderMask);
9752for (
unsignedI : seq<unsigned>(Sz)) {
9756if (!isa<PoisonValue>(V))
9759 ReorderMask[
I] =
I + Sz;
9763 VecTy, DemandedElts,
/*Insert=*/true,
/*Extract=*/false,
CostKind);
9766if (
Cost >= BVCost) {
9769TE.ReorderIndices.clear();
9775 BaseGraphSize = VectorizableTree.size();
9776// Turn graph transforming mode on and off, when done. 9777classGraphTransformModeRAAI {
9778bool &SavedIsGraphTransformMode;
9781 GraphTransformModeRAAI(
bool &IsGraphTransformMode)
9782 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9783 IsGraphTransformMode =
true;
9785 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode =
false; }
9786 } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are:
  // 1. At least one constant
  // ...
  // 3. Results in good vectorization opportunity, i.e. may generate vector
  // nodes and reduce cost of the graph.
                                          const InstructionsState &S) {
9797for (
unsignedOp : seq<unsigned>(S.getMainOp()->getNumOperands()))
9799 I2->getOperand(
Op));
9801 Candidates, [
this](
ArrayRef<std::pair<Value *, Value *>> Cand) {
9803 [](
const std::pair<Value *, Value *> &
P) {
9804return isa<Constant>(
P.first) ||
9805 isa<Constant>(
P.second) ||
P.first ==
P.second;
9811// Try to reorder gather nodes for better vectorization opportunities. 9812for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9813 TreeEntry &E = *VectorizableTree[
Idx];
9815 reorderGatherNode(E);
9818// The tree may grow here, so iterate over nodes, built before. 9819for (
unsignedIdx : seq<unsigned>(BaseGraphSize)) {
9820 TreeEntry &E = *VectorizableTree[
Idx];
9825// Do not try partial vectorization for small nodes (<= 2), nodes with the 9826// same opcode and same parent block or all constants. 9827if (VL.
size() <= 2 || LoadEntriesToVectorize.contains(
Idx) ||
9828 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9832// Try to find vectorizable sequences and transform them into a series of 9833// insertvector instructions. 9834unsigned StartIdx = 0;
9839 *
TTI, VL.
front()->getType(), VF - 1)) {
9840if (StartIdx + VF >
End)
9843for (
unsigned Cnt = StartIdx; Cnt + VF <=
End; Cnt += VF) {
9845// If any instruction is vectorized already - do not try again. 9846// Reuse the existing node, if it fully matches the slice. 9847if (
const TreeEntry *SE = getTreeEntry(Slice.
front());
9848 SE || getTreeEntry(Slice.
back())) {
9851if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9854// Constant already handled effectively - skip. 9857// Do not try to vectorize small splats (less than vector register and 9858// only with the single non-undef element). 9860bool IsTwoRegisterSplat =
true;
9861if (IsSplat && VF == 2) {
9864 IsTwoRegisterSplat = NumRegs2VF == 2;
9866if (Slices.
empty() || !IsSplat || !IsTwoRegisterSplat ||
9868static_cast<long>(isa<UndefValue>(Slice.
front()) ? VF - 1
9874 (S.getOpcode() == Instruction::Load &&
9876 (S.getOpcode() != Instruction::Load &&
9880// Try to vectorize reduced values or if all users are vectorized. 9881// For expensive instructions extra extracts might be profitable. 9882if ((!UserIgnoreList || E.Idx != 0) &&
9886if (isa<PoisonValue>(V))
9888return areAllUsersVectorized(cast<Instruction>(V),
9892if (S.getOpcode() == Instruction::Load) {
9897// Do not vectorize gathers. 9902// If reductions and the scalars from the root node are 9903// analyzed - mark as non-vectorizable reduction. 9904if (UserIgnoreList && E.Idx == 0)
9909 }
elseif (S.getOpcode() == Instruction::ExtractElement ||
9912 !CheckOperandsProfitability(
9915 IsaPred<Instruction>)),
9917// Do not vectorize extractelements (handled effectively 9918// alread). Do not vectorize non-profitable instructions (with 9919// low cost and non-vectorizable operands.) 9926auto AddCombinedNode = [&](
unsignedIdx,
unsigned Cnt,
unsigned Sz) {
9927 E.CombinedEntriesWithIndices.emplace_back(
Idx, Cnt);
9929 StartIdx = Cnt + Sz;
9933for (
auto [Cnt, Sz] : Slices) {
9935// If any instruction is vectorized already - do not try again. 9936if (TreeEntry *SE = getTreeEntry(Slice.
front());
9937 SE || getTreeEntry(Slice.
back())) {
9940if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9942 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9943 AddCombinedNode(SE->Idx, Cnt, Sz);
9946unsigned PrevSize = VectorizableTree.size();
9947 [[maybe_unused]]
unsigned PrevEntriesSize =
9948 LoadEntriesToVectorize.size();
9949 buildTree_rec(Slice, 0,
EdgeInfo(&E, UINT_MAX));
9950if (PrevSize + 1 == VectorizableTree.size() &&
9951 VectorizableTree[PrevSize]->isGather() &&
9952 VectorizableTree[PrevSize]->hasState() &&
9953 VectorizableTree[PrevSize]->getOpcode() !=
9954 Instruction::ExtractElement &&
9956if (UserIgnoreList && E.Idx == 0 && VF == 2)
9958 VectorizableTree.pop_back();
9959assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9960"LoadEntriesToVectorize expected to remain the same");
9963 AddCombinedNode(PrevSize, Cnt, Sz);
9966// Restore ordering, if no extra vectorization happened. 9967if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9970 E.ReorderIndices.clear();
9975switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to reorder masked gather loads, just reorder the scalar
      // operands.
      if (E.State != TreeEntry::Vectorize)
      Type *ScalarTy = E.getMainOp()->getType();
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as strided
      // load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse -
          // transform the node to strided load.
          E.State = TreeEntry::StridedVectorize;
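        // For illustration (hypothetical IR): loads of a[3], a[2], a[1], a[0]
        // would otherwise become a consecutive <4 x ...> load plus a reverse
        // shuffle; when the target reports a cheaper strided load, the node is
        // turned into a stride -1 strided load instead (using the pointer of
        // E.Scalars.back() as the base, as in the code above).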
10006case Instruction::Store: {
10008 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
10010Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
10011// Check if profitable to represent consecutive load + reverse as strided 10012// load with stride -1. 10013if (!E.ReorderIndices.empty() &&
isReverseOrder(E.ReorderIndices) &&
10017auto *BaseSI = cast<StoreInst>(E.Scalars.back());
10024 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
10025/*VariableMask=*/false, CommonAlignment,
CostKind, BaseSI);
10026if (StridedCost < OriginalVecCost)
10027// Strided store is more profitable than reverse + consecutive store - 10028// transform the node to strided store. 10029 E.State = TreeEntry::StridedVectorize;
10030 }
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto *BaseSI = cast<StoreInst>(E.Scalars.front());
        assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
        if (Mask.size() < 4)
        for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
              VecTy, Factor, BaseSI->getAlign(),
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
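        // For illustration: a reorder mask such as <0, 4, 1, 5, 2, 6, 3, 7>
        // interleaves two groups of four scalars, so IsInterleaveMask would
        // report Factor == 2 and the store node is marked as interleaved
        // (assuming the target supports interleaved stores of that factor).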
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
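        // For illustration (hypothetical IR): a pattern like
        //   %c = icmp slt i32 %a, %b
        //   %s = select i1 %c, i32 %a, i32 %b
        // is an smin; the select node becomes the combined MinMax node and,
        // when the compares are only used by these selects, the condition node
        // is folded into it as CombinedVectorize.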
10076if (LoadEntriesToVectorize.empty()) {
10077// Single load node - exit. 10078if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10079 VectorizableTree.front()->getOpcode() == Instruction::Load)
10081// Small graph with small VF - exit. 10082constexprunsigned SmallTree = 3;
10083constexprunsigned SmallVF = 2;
10084if ((VectorizableTree.size() <= SmallTree &&
10085 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10086 (VectorizableTree.size() <= 2 && UserIgnoreList))
10089if (VectorizableTree.front()->isNonPowOf2Vec() &&
10093 [](
const std::unique_ptr<TreeEntry> &TE) {
10094return TE->isGather() && TE->hasState() &&
10095 TE->getOpcode() == Instruction::Load &&
  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                return isa<LoadInst>(V) && !isVectorized(V) &&
                       !isDeleted(cast<Instruction>(V));
    for (Value *V : E.Scalars) {
      auto *LI = dyn_cast<LoadInst>(V);
          *this, V, *DL, *SE, *TTI,
          GatheredLoads[std::make_tuple(
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
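  // For illustration: loads that ended up in gather nodes (or remained scalar)
  // are bucketed here and re-examined as a group, e.g. four such loads from
  // adjacent addresses that came from different gather nodes may still be
  // combined into one vector load by tryToVectorizeGatheredLoads (assuming the
  // usual legality and cost checks pass).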
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
  bool IsFinalized = false;
  /// While set, still trying to estimate the cost for the same nodes and we
  /// can delay actual cost estimation (virtual shuffle instruction emission).
  /// May help better estimate the cost if same nodes must be permuted + allows
  /// to move most of the long shuffles cost estimation to TTI.
  bool SameNodesEstimated = true;
10167if (
auto *VTy = dyn_cast<VectorType>(Ty))
10181// Found the broadcasting of the single scalar, calculate the cost as 10183constauto *It =
find_if_not(VL, IsaPred<UndefValue>);
10184assert(It != VL.
end() &&
"Expected at least one non-undef value.");
10185// Add broadcast for non-identity shuffle only. 10187count(VL, *It) > 1 &&
10190if (isa<FixedVectorType>(ScalarTy)) {
10195 cast<FixedVectorType>(ScalarTy));
10198 CostKind, std::distance(VL.
begin(), It),
10204 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10211 VecTy, ShuffleMask, CostKind,
10212/*Index=*/0,
/*SubTp=*/nullptr,
10216 (
all_of(Gathers, IsaPred<UndefValue>)
10218 : R.getGatherCost(Gathers, !Root && VL.
equals(Gathers),
10222 /// Compute the cost of creating a vector containing the extracted values from 10226ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10227unsigned NumParts) {
10228assert(VL.
size() > NumParts &&
"Unexpected scalarized shuffle.");
10230 std::accumulate(VL.
begin(), VL.
end(), 0, [](
unsigned Sz,
Value *V) {
10231 auto *EE = dyn_cast<ExtractElementInst>(V);
10234 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10237 return std::max(Sz, VecTy->getNumElements());
10239// FIXME: this must be moved to TTI for better estimation. 10243 -> std::optional<TTI::ShuffleKind> {
10244if (NumElts <= EltsPerVector)
10245return std::nullopt;
10247alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10249 if (I == PoisonMaskElem)
10251 return std::min(S, I);
10254int OffsetReg1 = OffsetReg0;
10256// Check that if trying to permute same single/2 input vectors. 10258int FirstRegId = -1;
10259 Indices.assign(1, OffsetReg0);
10263intIdx =
I - OffsetReg0;
10265 (
Idx / NumElts) * NumParts + (
Idx % NumElts) / EltsPerVector;
10267 FirstRegId = RegId;
10268 RegIndices.
insert(RegId);
10269if (RegIndices.
size() > 2)
10270return std::nullopt;
10271if (RegIndices.
size() == 2) {
10273if (Indices.
size() == 1) {
10276 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10278 if (I == PoisonMaskElem)
10280 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10281 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10282 if (RegId == FirstRegId)
10284 return std::min(S, I);
10287 Indices.push_back(OffsetReg1 % NumElts);
10289Idx =
I - OffsetReg1;
10291I = (
Idx % NumElts) % EltsPerVector +
10292 (RegId == FirstRegId ? 0 : EltsPerVector);
10298// Process extracts in blocks of EltsPerVector to check if the source vector 10299// operand can be re-used directly. If not, add the cost of creating a 10300// shuffle to extract the values into a vector register. 10301for (
unsigned Part : seq<unsigned>(NumParts)) {
10302if (!ShuffleKinds[Part])
10305 Part * EltsPerVector,
getNumElems(Mask.size(), EltsPerVector, Part));
10309 std::optional<TTI::ShuffleKind> RegShuffleKind =
10310 CheckPerRegistersShuffle(SubMask, Indices);
10311if (!RegShuffleKind) {
10314 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.
size())))
10327 *R.TTI, VL.
front()->getType(),
alignTo(NumElts, EltsPerVector));
10328for (
unsignedIdx : Indices) {
10329assert((
Idx + EltsPerVector) <= BaseVF &&
10330"SK_ExtractSubvector index out of range");
10335// Second attempt to check, if just a permute is better estimated than 10336// subvector extract. 10341if (OriginalCost <
Cost)
10342Cost = OriginalCost;
10346 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given 10347 /// mask \p Mask, register number \p Part, that includes \p SliceSize 10349void estimateNodesPermuteCost(
const TreeEntry &E1,
const TreeEntry *E2,
10351unsigned SliceSize) {
10352if (SameNodesEstimated) {
10353// Delay the cost estimation if the same nodes are reshuffling. 10354// If we already requested the cost of reshuffling of E1 and E2 before, no 10355// need to estimate another cost with the sub-Mask, instead include this 10356// sub-Mask into the CommonMask to estimate it later and avoid double cost 10358if ((InVectors.
size() == 2 &&
10359 cast<const TreeEntry *>(InVectors.
front()) == &E1 &&
10360 cast<const TreeEntry *>(InVectors.
back()) == E2) ||
10361 (!E2 && cast<const TreeEntry *>(InVectors.
front()) == &E1)) {
10362unsigned Limit =
getNumElems(Mask.size(), SliceSize, Part);
10365"Expected all poisoned elements.");
10367copy(SubMask, std::next(CommonMask.
begin(), SliceSize * Part));
10370// Found non-matching nodes - need to estimate the cost for the matched 10371// and transform mask. 10372Cost += createShuffle(InVectors.
front(),
10373 InVectors.
size() == 1 ?
nullptr : InVectors.
back(),
10375 transformMaskAfterShuffle(CommonMask, CommonMask);
10376 }
elseif (InVectors.
size() == 2) {
10377Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10378 transformMaskAfterShuffle(CommonMask, CommonMask);
10380 SameNodesEstimated =
false;
10381if (!E2 && InVectors.
size() == 1) {
10382unsigned VF = E1.getVectorFactor();
10385 cast<FixedVectorType>(V1->
getType())->getNumElements());
10387constauto *E = cast<const TreeEntry *>(InVectors.
front());
10388 VF = std::max(VF, E->getVectorFactor());
10390for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10392 CommonMask[
Idx] = Mask[
Idx] + VF;
10393Cost += createShuffle(InVectors.
front(), &E1, CommonMask);
10394 transformMaskAfterShuffle(CommonMask, CommonMask);
10396autoP = InVectors.
front();
10397Cost += createShuffle(&E1, E2, Mask);
10398unsigned VF = Mask.size();
10403constauto *E = cast<const TreeEntry *>(
P);
10404 VF = std::max(VF, E->getVectorFactor());
10406for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10408 CommonMask[
Idx] =
Idx + (InVectors.
empty() ? 0 : VF);
10409Cost += createShuffle(
P, InVectors.
front(), CommonMask);
10410 transformMaskAfterShuffle(CommonMask, CommonMask);
10414classShuffleCostBuilder {
10417staticbool isEmptyOrIdentity(
ArrayRef<int> Mask,
unsigned VF) {
10419return Mask.empty() ||
10420 (VF == Mask.size() &&
10428 ~ShuffleCostBuilder() =
default;
10431// Empty mask or identity mask are free. 10433 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10434if (isEmptyOrIdentity(Mask, VF))
10437 cast<VectorType>(V1->
getType()), Mask);
10440// Empty mask or identity mask are free. 10442 cast<VectorType>(V1->
getType())->getElementCount().getKnownMinValue();
10443if (isEmptyOrIdentity(Mask, VF))
10446 cast<VectorType>(V1->
getType()), Mask);
10452void resizeToMatch(
Value *&,
Value *&)
const{}
10455 /// Smart shuffle instruction emission, walks through shuffles trees and 10456 /// tries to find the best matching vector for the actual shuffle 10462 ShuffleCostBuilder Builder(
TTI);
10465unsigned CommonVF = Mask.size();
10467auto GetNodeMinBWAffectedCost = [&](
const TreeEntry &E,
10471Type *EScalarTy = E.Scalars.front()->getType();
10472bool IsSigned =
true;
10473if (
auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10475 IsSigned = It->second.second;
10477if (EScalarTy != ScalarTy) {
10478unsigned CastOpcode = Instruction::Trunc;
10479unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10480unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10482 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10490if (isa<Constant>(V))
10492auto *VecTy = cast<VectorType>(V->getType());
10494if (EScalarTy != ScalarTy) {
10496unsigned CastOpcode = Instruction::Trunc;
10497unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10498unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10500 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10507if (!V1 && !V2 && !P2.
isNull()) {
10508// Shuffle 2 entry nodes. 10509const TreeEntry *E = cast<const TreeEntry *>(P1);
10510unsigned VF = E->getVectorFactor();
10511const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10512 CommonVF = std::max(VF, E2->getVectorFactor());
10515return Idx < 2 * static_cast<int>(CommonVF);
10517"All elements in mask must be less than 2 * CommonVF.");
10518if (E->Scalars.size() == E2->Scalars.size()) {
10522for (
int &
Idx : CommonMask) {
10525if (
Idx <
static_cast<int>(CommonVF) && !EMask.
empty())
10527elseif (
Idx >=
static_cast<int>(CommonVF))
10528Idx = (E2Mask.
empty() ?
Idx - CommonVF : E2Mask[
Idx - CommonVF]) +
10532 CommonVF = E->Scalars.size();
10533 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10534 GetNodeMinBWAffectedCost(*E2, CommonVF);
10536 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10537 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10541 }
elseif (!V1 && P2.
isNull()) {
10542// Shuffle single entry node. 10543const TreeEntry *E = cast<const TreeEntry *>(P1);
10544unsigned VF = E->getVectorFactor();
10548 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10549"All elements in mask must be less than CommonVF.");
10550if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10552assert(!EMask.
empty() &&
"Expected non-empty common mask.");
10553for (
int &
Idx : CommonMask) {
10557 CommonVF = E->Scalars.size();
10558 }
elseif (
unsigned Factor = E->getInterleaveFactor();
10559 Factor > 0 && E->Scalars.size() != Mask.size() &&
10562// Deinterleaved nodes are free. 10563 std::iota(CommonMask.
begin(), CommonMask.
end(), 0);
10565 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10567// Not identity/broadcast? Try to see if the original vector is better. 10568if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10569 CommonVF == CommonMask.
size() &&
10571 [](
constauto &&
P) {
10573static_cast<unsigned>(
P.value()) !=
P.index();
10581 }
elseif (V1 && P2.
isNull()) {
10582// Shuffle single vector. 10583 ExtraCost += GetValueMinBWAffectedCost(V1);
10584 CommonVF = getVF(V1);
10587 [=](
intIdx) {
return Idx < static_cast<int>(CommonVF); }) &&
10588"All elements in mask must be less than CommonVF.");
10589 }
elseif (V1 && !V2) {
10590// Shuffle vector and tree node. 10591unsigned VF = getVF(V1);
10592const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10593 CommonVF = std::max(VF, E2->getVectorFactor());
10596return Idx < 2 * static_cast<int>(CommonVF);
10598"All elements in mask must be less than 2 * CommonVF.");
10599if (E2->Scalars.size() == VF && VF != CommonVF) {
10601assert(!E2Mask.
empty() &&
"Expected non-empty common mask.");
10602for (
int &
Idx : CommonMask) {
10605if (
Idx >=
static_cast<int>(CommonVF))
10606Idx = E2Mask[
Idx - CommonVF] + VF;
10610 ExtraCost += GetValueMinBWAffectedCost(V1);
10612 ExtraCost += GetNodeMinBWAffectedCost(
10613 *E2, std::min(CommonVF, E2->getVectorFactor()));
10615 }
elseif (!V1 && V2) {
10616// Shuffle vector and tree node. 10617unsigned VF = getVF(V2);
10618const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10619 CommonVF = std::max(VF, E1->getVectorFactor());
10622return Idx < 2 * static_cast<int>(CommonVF);
10624"All elements in mask must be less than 2 * CommonVF.");
10625if (E1->Scalars.size() == VF && VF != CommonVF) {
10627assert(!E1Mask.
empty() &&
"Expected non-empty common mask.");
10628for (
int &
Idx : CommonMask) {
10631if (
Idx >=
static_cast<int>(CommonVF))
10632Idx = E1Mask[
Idx - CommonVF] + VF;
10638 ExtraCost += GetNodeMinBWAffectedCost(
10639 *E1, std::min(CommonVF, E1->getVectorFactor()));
10641 ExtraCost += GetValueMinBWAffectedCost(V2);
10644assert(V1 && V2 &&
"Expected both vectors.");
10645unsigned VF = getVF(V1);
10646 CommonVF = std::max(VF, getVF(V2));
10649return Idx < 2 * static_cast<int>(CommonVF);
10651"All elements in mask must be less than 2 * CommonVF.");
10653 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10654if (V1->
getType() != V2->getType()) {
10658if (cast<VectorType>(V1->
getType())->getElementType() != ScalarTy)
10660if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10664 InVectors.
front() =
10666if (InVectors.
size() == 2)
10668return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10669 V1, V2, CommonMask, Builder, ScalarTy);
10676 : BaseShuffleAnalysis(ScalarTy),
TTI(
TTI),
10677 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10678 CheckedExtracts(CheckedExtracts) {}
10680ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10681unsigned NumParts,
bool &UseVecBaseAsInput) {
10682 UseVecBaseAsInput =
false;
10685Value *VecBase =
nullptr;
10687if (!E->ReorderIndices.empty()) {
10689 E->ReorderIndices.end());
10692// Check if it can be considered reused if same extractelements were 10693// vectorized already. 10694bool PrevNodeFound =
any_of(
10696 [&](
const std::unique_ptr<TreeEntry> &TE) {
10697 return ((TE->hasState() && !TE->isAltShuffle() &&
10698 TE->getOpcode() == Instruction::ExtractElement) ||
10700 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10701 return VL.size() > Data.index() &&
10702 (Mask[Data.index()] == PoisonMaskElem ||
10703 isa<UndefValue>(VL[Data.index()]) ||
10704 Data.value() == VL[Data.index()]);
10709for (
unsigned Part : seq<unsigned>(NumParts)) {
10711ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10714// Ignore non-extractelement scalars. 10715if (isa<UndefValue>(V) ||
10718// If all users of instruction are going to be vectorized and this 10719// instruction itself is not going to be vectorized, consider this 10720// instruction as dead and remove its cost from the final cost of the 10722// Also, avoid adjusting the cost for extractelements with multiple uses 10723// in different graph entries. 10724auto *EE = cast<ExtractElementInst>(V);
10725 VecBase = EE->getVectorOperand();
10726 UniqueBases.
insert(VecBase);
10727const TreeEntry *VE = R.getTreeEntry(V);
10728if (!CheckedExtracts.
insert(V).second ||
10729 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10732 return isa<GetElementPtrInst>(U) &&
10733 !R.areAllUsersVectorized(cast<Instruction>(U),
10741unsignedIdx = *EEIdx;
10742// Take credit for instruction that will become dead. 10743if (EE->hasOneUse() || !PrevNodeFound) {
10745if (isa<SExtInst, ZExtInst>(Ext) &&
10746all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10747// Use getExtractWithExtendCost() to calculate the cost of 10748// extractelement/ext pair. 10751 EE->getVectorOperandType(),
Idx);
10752// Add back the cost of s|zext which is subtracted separately. 10754 Ext->getOpcode(), Ext->getType(), EE->getType(),
10763// Check that gather of extractelements can be represented as just a 10764// shuffle of a single/two vectors the scalars are extracted from. 10765// Found the bunch of extractelement instructions that must be gathered 10766// into a vector and can be represented as a permutation elements in a 10767// single input vector or of 2 input vectors. 10768// Done for reused if same extractelements were vectorized already. 10770Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10773 transformMaskAfterShuffle(CommonMask, CommonMask);
10774 SameNodesEstimated =
false;
10775if (NumParts != 1 && UniqueBases.
size() != 1) {
10776 UseVecBaseAsInput =
true;
10782 /// Checks if the specified entry \p E needs to be delayed because of its 10783 /// dependency nodes. 10784 std::optional<InstructionCost>
10787// No need to delay the cost estimation during analysis. 10788return std::nullopt;
10794return Idx < static_cast<int>(E1.getVectorFactor());
10796"Expected single vector shuffle mask.");
10800if (InVectors.
empty()) {
10801 CommonMask.
assign(Mask.begin(), Mask.end());
10802 InVectors.
assign({&E1, &E2});
10805assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10811unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10812 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10815if (InVectors.
empty()) {
10816 CommonMask.
assign(Mask.begin(), Mask.end());
10817 InVectors.
assign(1, &E1);
10820assert(!CommonMask.
empty() &&
"Expected non-empty common mask.");
10826unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10827 estimateNodesPermuteCost(E1,
nullptr, Mask, Part, SliceSize);
10828if (!SameNodesEstimated && InVectors.
size() == 1)
10831 /// Adds 2 input vectors and the mask for their shuffling. 10833// May come only for shuffling of 2 vectors with extractelements, already 10834// handled in adjustExtracts. 10840auto *EI = cast<ExtractElementInst>(
10841 cast<const TreeEntry *>(InVectors.
front())
10842 ->getOrdered(
P.index()));
10843return EI->getVectorOperand() == V1 ||
10844 EI->getVectorOperand() == V2;
10846"Expected extractelement vectors.");
10848 /// Adds another one input vector and the mask for the shuffling. 10850if (InVectors.
empty()) {
10852"Expected empty input mask/vectors.");
10853 CommonMask.
assign(Mask.begin(), Mask.end());
10854 InVectors.
assign(1, V1);
10858// No need to add vectors here, already handled them in adjustExtracts. 10859assert(InVectors.
size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10860 !CommonMask.
empty() &&
10863Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10864 ->getOrdered(
P.index());
10866returnP.value() == Mask[
P.index()] ||
10867 isa<UndefValue>(Scalar);
10868if (isa<Constant>(V1))
10870auto *EI = cast<ExtractElementInst>(Scalar);
10871return EI->getVectorOperand() == V1;
10873"Expected only tree entry for extractelement vectors.");
10877"Expected only tree entries from extracts/reused buildvectors.");
10878unsigned VF = getVF(V1);
10879if (InVectors.
size() == 2) {
10880Cost += createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
10881 transformMaskAfterShuffle(CommonMask, CommonMask);
10882 VF = std::max<unsigned>(VF, CommonMask.
size());
10883 }
elseif (
constauto *InTE =
10884 InVectors.
front().dyn_cast<
const TreeEntry *>()) {
10885 VF = std::max(VF, InTE->getVectorFactor());
10888 VF, cast<FixedVectorType>(cast<Value *>(InVectors.
front())->getType())
10889 ->getNumElements());
10892for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
10894 CommonMask[
Idx] = Mask[
Idx] + VF;
10897Value *Root =
nullptr) {
10898Cost += getBuildVectorCost(VL, Root);
10900// FIXME: Need to find a way to avoid use of getNullValue here. 10902unsigned VF = VL.
size();
10904 VF = std::min(VF, MaskVF);
10906if (isa<UndefValue>(V)) {
10912if (
auto *VecTy = dyn_cast<FixedVectorType>(Vals.
front()->getType())) {
10914// When REVEC is enabled, we need to expand vector types into scalar 10919Type *ScalarTy = V->getType()->getScalarType();
10921if (isa<PoisonValue>(V))
10923elseif (isa<UndefValue>(V))
10927 std::fill_n(NewVals.
begin() +
I * VecTyNumElements, VecTyNumElements,
10930 Vals.
swap(NewVals);
10936 cast<FixedVectorType>(Root->
getType())->getNumElements()),
10940 /// Finalize emission of the shuffles. 10943ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10949if (InVectors.
size() == 2)
10950Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10952Cost += createShuffle(Vec,
nullptr, CommonMask);
10953 transformMaskAfterShuffle(CommonMask, CommonMask);
10955"Expected vector length for the final value before action.");
10956Value *V = cast<Value *>(Vec);
10957 Action(V, CommonMask);
10958 InVectors.
front() = V;
10960if (!SubVectors.empty()) {
10962if (InVectors.
size() == 2)
10963Cost += createShuffle(Vec, InVectors.
back(), CommonMask);
10965Cost += createShuffle(Vec,
nullptr, CommonMask);
10966 transformMaskAfterShuffle(CommonMask, CommonMask);
10967// Add subvectors permutation cost. 10968if (!SubVectorsMask.
empty()) {
10970"Expected same size of masks for subvectors and common mask.");
10972copy(SubVectorsMask, SVMask.begin());
10973for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
10976 I1 = I2 + CommonMask.
size();
10983for (
auto [E,
Idx] : SubVectors) {
10984Type *EScalarTy = E->Scalars.front()->getType();
10985bool IsSigned =
true;
10986if (
auto It =
R.MinBWs.find(E); It !=
R.MinBWs.end()) {
10989 IsSigned = It->second.second;
10991if (ScalarTy != EScalarTy) {
10992unsigned CastOpcode = Instruction::Trunc;
10993unsigned DstSz =
R.DL->getTypeSizeInBits(ScalarTy);
10994unsigned SrcSz =
R.DL->getTypeSizeInBits(EScalarTy);
10996 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
11006if (!CommonMask.
empty()) {
11007 std::iota(std::next(CommonMask.
begin(),
Idx),
11008 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
11014if (!ExtMask.
empty()) {
11015if (CommonMask.
empty()) {
11019for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
11022 NewMask[
I] = CommonMask[ExtMask[
I]];
11024 CommonMask.
swap(NewMask);
11027if (CommonMask.
empty()) {
11028assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
11032 createShuffle(InVectors.
front(),
11033 InVectors.
size() == 2 ? InVectors.
back() :
nullptr,
11039"Shuffle construction must be finalized.");
11043const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(
const TreeEntry *E,
11045if (
const TreeEntry *VE = getMatchedVectorizedOperand(E,
Idx))
11048find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
11049return TE->isGather() &&
11050find_if(TE->UserTreeIndices, [&](
const EdgeInfo &EI) {
11051 return EI.EdgeIdx == Idx && EI.UserTE == E;
11052 }) != TE->UserTreeIndices.end();
11054assert(It != VectorizableTree.end() &&
"Expected vectorizable entry.");
11059if (TE.State == TreeEntry::ScatterVectorize ||
11060 TE.State == TreeEntry::StridedVectorize)
11062if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11063 !TE.isAltShuffle()) {
11064if (TE.ReorderIndices.empty())
11074/// Builds the arguments types vector for the given call instruction with the 11075/// given \p ID for the specified vector factor. 11078constunsigned VF,
unsigned MinBW,
11108// If we have computed a smaller type for the expression, update VecTy so 11109// that the costs will be accurate. 11110auto It = MinBWs.
find(E);
11111Type *OrigScalarTy = ScalarTy;
11112if (It != MinBWs.
end()) {
11113auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11119unsigned EntryVF = E->getVectorFactor();
11122if (E->isGather()) {
11125if (isa<InsertElementInst>(VL[0]))
11127if (isa<CmpInst>(VL.
front()))
11128 ScalarTy = VL.
front()->getType();
11129return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11130 E, ScalarTy, *
TTI, VectorizedVals, *
this, CheckedExtracts);
11134if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11137if (E->getOpcode() == Instruction::Store) {
11138// For stores the order is actually a mask. 11139 NewMask.
resize(E->ReorderIndices.size());
11140copy(E->ReorderIndices, NewMask.
begin());
11146if (!E->ReuseShuffleIndices.empty())
11147::addMask(Mask, E->ReuseShuffleIndices);
11151assert((E->State == TreeEntry::Vectorize ||
11152 E->State == TreeEntry::ScatterVectorize ||
11153 E->State == TreeEntry::StridedVectorize) &&
11157 (E->getOpcode() == Instruction::GetElementPtr &&
11158 E->getMainOp()->getType()->isPointerTy())) &&
11161unsigned ShuffleOrOp =
11162 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
11163if (E->CombinedOp != TreeEntry::NotCombinedOp)
11164 ShuffleOrOp = E->CombinedOp;
11166constunsigned Sz = UniqueValues.
size();
11168for (
unsignedI = 0;
I < Sz; ++
I) {
11169if (isa<Instruction>(UniqueValues[
I]) && getTreeEntry(UniqueValues[
I]) == E)
11171 UsedScalars.set(
I);
11173auto GetCastContextHint = [&](
Value *
V) {
11174if (
const TreeEntry *OpTE = getTreeEntry(V))
11175return getCastContextHint(*OpTE);
11176 InstructionsState SrcState =
getSameOpcode(E->getOperand(0), *TLI);
11177if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11178 !SrcState.isAltShuffle())
11185// Calculate the cost of this instruction. 11187if (isa<CastInst, CallInst>(VL0)) {
11188// For some of the instructions no need to calculate cost for each 11189// particular instruction, we can use the cost of the single 11190// instruction x total number of scalar instructions. 11191 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11193for (
unsignedI = 0;
I < Sz; ++
I) {
11194if (UsedScalars.test(
I))
11196 ScalarCost += ScalarEltCost(
I);
11201// Check if the current node must be resized, if the parent node is not 11205 (E->getOpcode() != Instruction::Load ||
11206 !E->UserTreeIndices.empty())) {
11207const EdgeInfo &EI =
11208 *
find_if(E->UserTreeIndices, [](
const EdgeInfo &EI) {
11209 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11211if (EI.UserTE->getOpcode() != Instruction::Select ||
11213auto UserBWIt = MinBWs.
find(EI.UserTE);
11214Type *UserScalarTy =
11215 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11216if (UserBWIt != MinBWs.
end())
11218 UserBWIt->second.first);
11219if (ScalarTy != UserScalarTy) {
11220unsigned BWSz =
DL->getTypeSizeInBits(ScalarTy);
11221unsigned SrcBWSz =
DL->getTypeSizeInBits(UserScalarTy);
11223auto *UserVecTy =
getWidenedType(UserScalarTy, E->Scalars.size());
11225 VecOpcode = Instruction::Trunc;
11228 It->second.second ? Instruction::SExt : Instruction::ZExt;
11235LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11236 ScalarCost,
"Calculated costs for Tree"));
11237return VecCost - ScalarCost;
11239// Calculate cost difference from vectorizing set of GEPs. 11240// Negative value means vectorizing is profitable. 11242assert((E->State == TreeEntry::Vectorize ||
11243 E->State == TreeEntry::StridedVectorize) &&
11244"Entry state expected to be Vectorize or StridedVectorize here.");
11248 *
TTI, Ptrs, BasePtr, E->getOpcode(),
CostKind, OrigScalarTy, VecTy);
11249LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11250"Calculated GEPs cost for Tree"));
11252return VecCost - ScalarCost;
11259Type *CanonicalType = Ty;
11266 {CanonicalType, CanonicalType});
11269// If the selects are the only uses of the compares, they will be 11270// dead and we can adjust the cost by removing their cost. 11271if (VI && SelectOnly) {
11273"Expected only for scalar type.");
11274auto *CI = cast<CmpInst>(
VI->getOperand(0));
11276 CI->
getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11277CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11278 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11280return IntrinsicCost;
11282switch (ShuffleOrOp) {
11283case Instruction::PHI: {
11284// Count reused scalars. 11287for (
Value *V : UniqueValues) {
11288auto *
PHI = dyn_cast<PHINode>(V);
11293for (
unsignedI = 0,
N =
PHI->getNumIncomingValues();
I <
N; ++
I) {
11297if (
const TreeEntry *OpTE = getTreeEntry(
Operands.front()))
11299if (!OpTE->ReuseShuffleIndices.empty())
11300 ScalarCost +=
TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11301 OpTE->Scalars.size());
11304return CommonCost - ScalarCost;
11306case Instruction::ExtractValue:
11307case Instruction::ExtractElement: {
11308auto GetScalarCost = [&](
unsignedIdx) {
11309if (isa<PoisonValue>(UniqueValues[
Idx]))
11312auto *
I = cast<Instruction>(UniqueValues[
Idx]);
11314if (ShuffleOrOp == Instruction::ExtractElement) {
11315auto *EE = cast<ExtractElementInst>(
I);
11316 SrcVecTy = EE->getVectorOperandType();
11318auto *EV = cast<ExtractValueInst>(
I);
11319Type *AggregateTy = EV->getAggregateOperand()->getType();
11321if (
auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11322 NumElts = ATy->getNumElements();
11327if (
I->hasOneUse()) {
11329if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11330all_of(
Ext->users(), IsaPred<GetElementPtrInst>)) {
11331// Use getExtractWithExtendCost() to calculate the cost of 11332// extractelement/ext pair. 11335// Subtract the cost of s|zext which is subtracted separately. 11337Ext->getOpcode(),
Ext->getType(),
I->getType(),
11345auto GetVectorCost = [](
InstructionCost CommonCost) {
return CommonCost; };
11346return GetCostDiff(GetScalarCost, GetVectorCost);
11348case Instruction::InsertElement: {
11349assert(E->ReuseShuffleIndices.empty() &&
11350"Unique insertelements only are expected.");
11351auto *SrcVecTy = cast<FixedVectorType>(VL0->
getType());
11352unsignedconst NumElts = SrcVecTy->getNumElements();
11353unsignedconst NumScalars = VL.
size();
11359unsigned OffsetEnd = OffsetBeg;
11360 InsertMask[OffsetBeg] = 0;
11365elseif (OffsetEnd <
Idx)
11367 InsertMask[
Idx] =
I + 1;
11370if (NumOfParts > 0 && NumOfParts < NumElts)
11371 VecScalarsSz =
PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11372unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11374unsignedOffset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11375unsigned InsertVecSz = std::min<unsigned>(
11377 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11378bool IsWholeSubvector =
11379 OffsetBeg ==
Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11380// Check if we can safely insert a subvector. If it is not possible, just 11381// generate a whole-sized vector and shuffle the source vector and the new 11383if (OffsetBeg + InsertVecSz > VecSz) {
11384// Align OffsetBeg to generate correct mask. 11386 InsertVecSz = VecSz;
11390// TODO: Add support for Instruction::InsertValue. 11392if (!E->ReorderIndices.empty()) {
11397 std::iota(
Mask.begin(), std::next(
Mask.begin(), InsertVecSz), 0);
11399bool IsIdentity =
true;
11401Mask.swap(PrevMask);
11402for (
unsignedI = 0;
I < NumScalars; ++
I) {
11404 DemandedElts.
setBit(InsertIdx);
11405 IsIdentity &= InsertIdx - OffsetBeg ==
I;
11406Mask[InsertIdx - OffsetBeg] =
I;
11408assert(
Offset < NumElts &&
"Failed to find vector index offset");
11412/*Insert*/true,
/*Extract*/false,
11415// First cost - resize to actual vector size if not identity shuffle or 11416// need to shift the vector. 11417// Do not calculate the cost if the actual size is the register size and 11418// we can merge this shuffle with the following SK_Select. 11422 InsertVecTy, Mask);
11423auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
11424 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11426// Second cost - permutation with subvector, if some elements are from the 11427// initial vector or inserting a subvector. 11428// TODO: Implement the analysis of the FirstInsert->getOperand(0) 11429// subvector of ActualVecTy. 11432buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11433if (!InMask.
all() && NumScalars != NumElts && !IsWholeSubvector) {
11434if (InsertVecSz != VecSz) {
11445for (
unsignedI = OffsetEnd + 1 -
Offset;
I < VecSz; ++
I)
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
11466auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
11469unsigned Opcode = ShuffleOrOp;
11470unsigned VecOpcode = Opcode;
11472 (SrcIt != MinBWs.
end() || It != MinBWs.
end())) {
11473// Check if the values are candidates to demote. 11474unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy->
getScalarType());
11475if (SrcIt != MinBWs.
end()) {
11476 SrcBWSz = SrcIt->second.first;
11483if (BWSz == SrcBWSz) {
11484 VecOpcode = Instruction::BitCast;
11485 }
elseif (BWSz < SrcBWSz) {
11486 VecOpcode = Instruction::Trunc;
11487 }
elseif (It != MinBWs.
end()) {
11488assert(BWSz > SrcBWSz &&
"Invalid cast!");
11489 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11490 }
elseif (SrcIt != MinBWs.
end()) {
11491assert(BWSz > SrcBWSz &&
"Invalid cast!");
11493 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11495 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
11496 !SrcIt->second.second) {
11497 VecOpcode = Instruction::UIToFP;
11500assert(
Idx == 0 &&
"Expected 0 index only");
11506// Do not count cost here if minimum bitwidth is in effect and it is just 11507// a bitcast (here it is just a noop). 11508if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11510auto *
VI = VL0->
getOpcode() == Opcode ? VL0 :
nullptr;
11513bool IsArithmeticExtendedReduction =
11514 E->Idx == 0 && UserIgnoreList &&
11516auto *
I = cast<Instruction>(V);
11517returnis_contained({Instruction::Add, Instruction::FAdd,
11518 Instruction::Mul, Instruction::FMul,
11519 Instruction::And, Instruction::Or,
11523if (IsArithmeticExtendedReduction &&
11524 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11528 VecOpcode == Opcode ? VI :
nullptr);
11530return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FCmp:
    case Instruction::ICmp:
    case Instruction::Select: {
      match(VL0, MatchCmp))
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        !match(VI, MatchCmp)) ||
        E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
        getOperandInfo(VI->getOperand(1)), VI);
        ScalarCost = IntrinsicCost;
      CostKind, getOperandInfo(E->getOperand(0)),
      getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        unsigned CondNumElements = CondType->getNumElements();
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
          // When the return type is i1 but the source is fixed vector type, we
          // need to duplicate the condition value.
      return VecCost + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case TreeEntry::MinMax: {
      auto GetScalarCost = [&](unsigned Idx) {
        return GetMinMaxCost(OrigScalarTy);
      return VecCost + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::FNeg:
    case Instruction::Add:
    case Instruction::FAdd:
    case Instruction::Sub:
    case Instruction::FSub:
    case Instruction::Mul:
    case Instruction::FMul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor: {
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
          auto *CI = dyn_cast<ConstantInt>(Op);
          return CI && CI->getValue().countr_one() >= It->second.first;
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
      Op2Info, {}, nullptr, TLI) +
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::GetElementPtr: {
      return CommonCost + GetGEPCostDiff(VL, VL0);
    case Instruction::Load: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *VI = cast<LoadInst>(UniqueValues[Idx]);
        VI->getAlign(), VI->getPointerAddressSpace(),
      auto *LI0 = cast<LoadInst>(VL0);
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
          Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
          LI0->getPointerAddressSpace(), CostKind);
          Instruction::Load, VecTy, LI0->getAlign(),
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        Instruction::Load, VecTy, LI0->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
        Instruction::Load, VecTy, LI0->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
      return VecLdCost + CommonCost;
      // If this node generates masked gather load then it is not a terminal node.
      // Hence address operand cost is estimated separately.
      if (E->State == TreeEntry::ScatterVectorize)
      // Estimate cost of GEPs since this tree node is a terminator.
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
      return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
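      // Note on the Load case above (illustrative summary): depending on the
      // TreeEntry state the vector cost is modeled as an interleaved load
      // (Vectorize with an interleave factor), a plain wide load (Vectorize),
      // a strided load (StridedVectorize) or a masked gather
      // (ScatterVectorize). For the masked gather the address operands are
      // not terminal here, so their cost is estimated separately, while for
      // the other forms the feeding GEPs are costed via GetGEPCostDiff.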
    case Instruction::Store: {
      bool IsReorder = !E->ReorderIndices.empty();
      auto GetScalarCost = [=](unsigned Idx) {
        auto *VI = cast<StoreInst>(VL[Idx]);
        VI->getAlign(), VI->getPointerAddressSpace(),
      cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
      // We know that we can merge the stores. Calculate the cost.
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
        Instruction::Store, VecTy, BaseSI->getPointerOperand(),
        /*VariableMask=*/false, CommonAlignment, CostKind);
      assert(E->State == TreeEntry::Vectorize &&
             "Expected either strided or consecutive stores.");
      if (unsigned Factor = E->getInterleaveFactor()) {
        assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
               "No reused shuffles expected");
        Instruction::Store, VecTy, Factor, std::nullopt,
        BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        Instruction::Store, VecTy, BaseSI->getAlign(),
        BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
      return VecStCost + CommonCost;
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
      return GetCostDiff(GetScalarCost, GetVectorCost) +
             GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
    case Instruction::Call: {
      auto GetScalarCost = [&](unsigned Idx) {
        auto *CI = cast<CallInst>(UniqueValues[Idx]);
      auto *CI = cast<CallInst>(VL0);
      It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
      return GetCostDiff(GetScalarCost, GetVectorCost);
    case Instruction::ShuffleVector: {
      if (!SLPReVec || E->isAltShuffle())
        assert(E->isAltShuffle() &&
               (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");
      // Try to find the previous shuffle node with the same operands and same
      // main/alternate ops.
      auto TryFindNodeWithEqualOperands = [=]() {
        for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
          if (TE->hasState() && TE->isAltShuffle() &&
              ((TE->getOpcode() == E->getOpcode() &&
                TE->getAltOpcode() == E->getAltOpcode()) ||
               (TE->getOpcode() == E->getAltOpcode() &&
                TE->getAltOpcode() == E->getOpcode())) &&
              TE->hasEqualOperands(*E))
      auto GetScalarCost = [&](unsigned Idx) {
        if (isa<PoisonValue>(UniqueValues[Idx]))
        auto *VI = cast<Instruction>(UniqueValues[Idx]);
        assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
      // Need to clear CommonCost since the final shuffle cost is included into
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
      if (TryFindNodeWithEqualOperands()) {
        dbgs() << "SLP: diamond match for alternate node found.\n";
        // No need to add new vector costs here since we're going to reuse
        // same main/alternate vector ops, just do different shuffling.
        TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
        TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
        auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
        unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
        DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
        if (SrcIt != MinBWs.end()) {
          SrcBWSz = SrcIt->second.first;
        if (BWSz <= SrcBWSz) {
          TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
          << "SLP: alternate extension, which should be truncated.\n";
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
        TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
      E->buildAltOpShuffleMask(
      assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
      // Patterns like [fadd,fsub] can be combined into a single instruction
      // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
      // need to take into account their order when looking for the most used
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      // If this pattern is supported by the target then we consider the
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
        VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      // TODO: Check the reverse order too.
      if (SLPReVec && !E->isAltShuffle())
        // If a group uses mask in order, the shufflevector can be
        // eliminated by instcombine. Then the cost is 0.
        "Not supported shufflevector usage.");
        auto *SV = cast<ShuffleVectorInst>(VL.front());
        unsigned SVNumElements =
            cast<FixedVectorType>(SV->getOperand(0)->getType())
                ->getNumElements();
        unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
        assert(isa<ShuffleVectorInst>(V) && "Not supported shufflevector usage.");
        auto *SV = cast<ShuffleVectorInst>(V);
        [[maybe_unused]] bool IsExtractSubvectorMask =
            SV->isExtractSubvectorMask(Index);
        assert(IsExtractSubvectorMask && "Not supported shufflevector usage.");
        if (NextIndex != Index)
        NextIndex += SV->getShuffleMask().size();
        return ::getShuffleCost(
      return GetCostDiff(GetScalarCost, GetVectorCost);
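      // Note on the alternate-opcode case above (illustrative example, not
      // taken from this file): a bundle such as
      //   a0 = fadd x0, y0 ; a1 = fsub x1, y1 ; a2 = fadd x2, y2 ; a3 = fsub x3, y3
      // is costed as two whole-vector instructions (one fadd, one fsub) plus
      // the blend shuffle described by buildAltOpShuffleMask, unless the
      // target reports the (Opcode0, Opcode1) pair as a legal "alt"
      // instruction via isLegalAltInstr, in which case the cheaper of the two
      // models is used.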
    case Instruction::Freeze:
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable.\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
           [this](Value *V) { return EphValues.contains(V); }) &&
           TE->Scalars.size() < Limit ||
           (((TE->hasState() &&
              TE->getOpcode() == Instruction::ExtractElement) ||
             all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));

  // We only handle trees of heights 1 and 2.
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
       AreVectorizableGathers(VectorizableTree[0].get(),
                              VectorizableTree[0]->Scalars.size()) &&
       VectorizableTree[0]->getVectorFactor() > 2)))

  if (VectorizableTree.size() != 2)

  // Handle splat and all-constants stores. Also try to vectorize tiny trees
  // with the second gather nodes if they have fewer scalar operands than the
  // initial tree element (it may be profitable to shuffle the second gather)
  // or they are extractelements, which form a shuffle.
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))

  // Gathering cost would be too much for tiny trees.
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
                                       bool MustMatchOrInst) {
  // Look past the root to find a source value. Arbitrarily follow the
  // path through operand 0 of any 'or'. Also, peek through optional
  // shift-left-by-multiple-of-8-bits.
  Value *ZextLoad = Root;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
         ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  // Check if the input is an extended load of the required or/shift expression.
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  // Require that the total load bit width is a legal integer type.
  // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
  // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
  Type *SrcTy = Load->getType();
  // Everything matched - assume that we can fold the whole sequence using
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");
  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];
  /* MatchOr */ false);

  // Peek through a final sequence of stores and check if all operations are
  // likely to be load-combined.
  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
  // Graph is empty - do nothing.
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");

  // No need to vectorize inserts of gathered values.
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||

  // If the graph includes only PHI nodes and gathers, it is definitely not
  // profitable for the vectorization, we can skip it, if the cost threshold is
  // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
  // gathers/buildvectors.
  constexpr int Limit = 4;
  !VectorizableTree.empty() &&
  all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
    return (TE->isGather() &&
            (!TE->hasState() ||
             TE->getOpcode() != Instruction::ExtractElement) &&
            count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
           (TE->hasState() && TE->getOpcode() == Instruction::PHI);

  // We can vectorize the tree if its size is greater than or equal to the
  // minimum size specified by the MinTreeSize command line option.

  // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
  // can vectorize it if we can prove it fully vectorizable.
  if (isFullyVectorizableTinyTree(ForReduction))

  // Check if any of the gather nodes forms an insertelement buildvector.
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
                 return isa<ExtractElementInst, UndefValue>(V) ||
                        (IsAllowedSingleBVNode &&
                         !V->hasNUsesOrMore(UsesLimit) &&
                         any_of(V->users(), IsaPred<InsertElementInst>));

  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
      getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                     VectorizableTree.back()->getVectorFactor()),
      /*Insert=*/true, /*Extract=*/false,

  // Otherwise, we can't vectorize the tree. It is both tiny and not fully
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
      [](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && TE->hasState() &&
               TE->getOpcode() == Instruction::Load &&
  TreeEntry &E = *VectorizableTree[Idx];
  if (E.hasState() && E.getOpcode() != Instruction::Load)
  // Walk from the bottom of the tree to the top, tracking which values are
  // live. When we see a call instruction that is not part of our tree,
  // query TTI to see if there is a cost to keeping values live over it
  // (for example, if spills and fills are required).
  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();

  // The entries in VectorizableTree are not necessarily ordered by their
  // position in basic blocks. Collect them and order them by dominance so later
  // instructions are guaranteed to be visited first. For instructions in
  // different basic blocks, we only scan to the beginning of the block, so
  // their order does not matter, as long as all instructions in a basic block
  // are grouped together. Using dominance ensures a deterministic order.
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
  auto *NodeA = DT->getNode(A->getParent());
  auto *NodeB = DT->getNode(B->getParent());
  assert(NodeA && "Should only process reachable instructions");
  assert(NodeB && "Should only process reachable instructions");
  assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
         "Different nodes should have different DFS numbers");
  return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
  return B->comesBefore(A);

  // Update LiveValues.
  LiveValues.erase(PrevInst);
  for (auto &J : PrevInst->operands()) {
    if (isa<Instruction>(&*J) && getTreeEntry(&*J))
      LiveValues.insert(cast<Instruction>(&*J));
  dbgs() << "SLP: #LV: " << LiveValues.size();
  for (auto *X : LiveValues)
    dbgs() << " " << X->getName();
  dbgs() << ", Looking at ";

  // Now find the sequence of instructions between PrevInst and Inst.
  unsigned NumCalls = 0;
  while (InstIt != PrevInstIt) {
    if (PrevInstIt == PrevInst->getParent()->rend()) {
      PrevInstIt = Inst->getParent()->rbegin();
    if (auto *II = dyn_cast<IntrinsicInst>(I)) {
      if (II->isAssumeLikeIntrinsic())
    if (IntrCost < CallCost)
    // Debug information does not impact spill cost.
    if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
        &*PrevInstIt != PrevInst)

  for (auto *II : LiveValues) {
    auto *ScalarTy = II->getType();
    if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
      ScalarTy = VectorTy->getElementType();
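  // Note on the walk above (illustrative summary): the spill cost models the
  // case where a vectorized bundle keeps values live across a call that is
  // not part of the tree. Assume-like and cheap intrinsics are ignored, and
  // for real calls the target is asked whether keeping the currently live
  // values across the call implies extra save/restore work.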
/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
/// the buildvector sequence.
  const auto *I1 = IE1;
  const auto *I2 = IE2;
  if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
    I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
  if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
    I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));

/// Returns incoming Value *, if the requested type is Value * too, or a
/// default value, otherwise.
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {

/// Does the analysis of the provided shuffle masks and performs the requested
/// actions on the vectors with the given shuffle masks. It tries to do it in
/// several steps:
/// 1. If the Base vector is not an undef vector, resize the very first mask to
/// have a common VF and perform the action for 2 input vectors (including the
/// non-undef Base). Other shuffle masks are combined with the result of the
/// first stage and processed as a shuffle of 2 elements.
/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
/// the action only for 1 vector with the given mask, if it is not the identity
/// mask.
/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
/// vectors, combining the masks properly between the steps.
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
  buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    // Base is not undef, need to combine it with the next subvectors.
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
      Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    [[maybe_unused]] auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    // Base is undef and only 1 vector is shuffled - perform the action only for
    // single vector, if the mask is not the identity mask.
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
                                            /*ForSingleMask=*/true);
    // Identity mask is found.
    Prev = Action(Mask, {ShuffleMask.begin()->first});
    // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
    // shuffles step by step, combining shuffle between the steps.
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      // No need to resize the input vectors since they are of the same size, we
      // can shuffle them directly.
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        Mask[I] = SecMask[I] + Vec1VF;
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
      // Vectors of different sizes - resize and reshuffle.
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
                                               /*ForSingleMask=*/false);
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
        Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      Prev = Action(Mask, {Res1.first, Res2.first});
    VMIt = std::next(VMIt);
  [[maybe_unused]] bool IsBaseNotUndef = !IsBaseUndef.all();
  // Perform requested actions for the remaining masks/vectors.
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    // Shuffle other input vectors, if any.
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
      "Multiple uses of scalars.");
      Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    Prev = Action(Mask, {Prev, Res.first});
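/// Usage sketch for the helper above (illustrative only; the concrete
/// ResizeAction/Action callables live at the call sites): the caller passes a
/// list of (vector, mask) pairs describing an insertelement sequence, a GetVF
/// callback returning the vector factor of a value, a ResizeAction that
/// brings one input to the common VF, and an Action that either emits a
/// shuffle or, as in the tree-cost computation below, merely estimates its
/// cost.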
/// Data type for handling buildvector sequences with the reused scalars from
/// other tree entries.
template <typename T> struct ShuffledInsertData {
  /// List of insertelements to be replaced by shuffles.
  /// The parent vectors and shuffle mask for the given list of inserts.

                    << VectorizableTree.size() << ".\n");
  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    // No need to count the cost for combined entries, they are combined and
    // just skip their cost.
    if (TE.State == TreeEntry::CombinedVectorize) {
      dbgs() << "SLP: Skipping cost for combined node that starts with "
             << *TE.Scalars[0] << ".\n";
      TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
        // Some gather nodes might be absolutely the same as some vectorizable
        // nodes after reordering, need to handle it.
        << "SLP: Current total cost = " << Cost << "\n");
    // Exclude cost of gather loads nodes which are not used. These nodes were
    // built as part of the final attempt to vectorize gathered loads.
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
    << "SLP: Current total cost = " << Cost << "\n");

  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  // Keep track of {Scalar, Index, User} tuples.
  // On AArch64, this helps in fusing a mov instruction, associated with
  // extractelement, with fmul in the backend so that extractelement is free.
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  for (ExternalUser &EU : ExternalUses) {
    // Uses by ephemeral values are free (because the ephemeral value will be
    // removed prior to code generation, and so the extraction will be
    // removed as well).
    if (EphValues.count(EU.User))
    // Used in unreachable blocks or in EH pads (rarely executed) or is
    // terminated with unreachable instruction.
    EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
    isa_and_present<UnreachableInst>(UserParent->getTerminator())))
    // We only add extract cost once for the same scalar.
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
    // No extract cost for vector "scalar".
    if (isa<FixedVectorType>(EU.Scalar->getType()))
    // If found user is an insertelement, do not calculate extract cost but try
    // to detect it as a final shuffled/identity match.
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
        [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
          // Checks if 2 insertelements are from the same buildvector.
          Value *Op0 = II->getOperand(0);
          if (getTreeEntry(II) && !getTreeEntry(Op0))
        if (It == ShuffledInserts.end()) {
          Data.InsertElements.emplace_back(VU);
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
              .insert(std::make_pair(ScalarTE, FTy->getElementType()))
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            VecOpcode = Instruction::Trunc;
            It->second.second ? Instruction::SExt : Instruction::ZExt;
            FTy->getNumElements()),
            << " for extending externally used vector with "
               "non-equal minimum bitwidth.\n");
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        int InIdx = *InsertIdx;
        ShuffledInserts[VecId].ValueMasks[ScalarTE];
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
    // If we plan to rewrite the tree in a smaller type, we will need to sign
    // extend the extracted value back to the original type. Here, we account
    // for the extract and the added cost of the sign extend if needed.
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
      ? Instruction::ZExt
      : Instruction::SExt;
    EU.Lane, EU.Scalar, ScalarUserAndIdx);
    // Leave the scalar instructions as is if they are cheaper than extracts.
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      // Checks if the user of the external scalar is a phi in the loop body.
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
        // Ignore phis in loops.
        if (IsPhiInLoop(P.value()))
        ValueToExtUses->try_emplace(P.value().Scalar, P.index());
      // Can use the original instruction, if no operands are vectorized or they
      // are marked as externally used already.
      auto *Inst = cast<Instruction>(EU.Scalar);
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          // Some extractelements might be not vectorized, but
          // transformed into shuffle and removed from the function,
          // consider it here.
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
        return ValueToExtUses->contains(V);
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
          (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        // Try to keep the original scalar if the user is a phi node from the
        // same block as the root phis, currently vectorized. It allows to keep
        // better ordering info of PHIs, being vectorized currently.
        bool IsProfitablePHIUser =
            VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
            auto *PHIUser = dyn_cast<PHINode>(U);
            return (!PHIUser ||
                    PHIUser->getParent() !=
                    VectorizableTree.front()->getMainOp())
            return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
          (!GatheredLoadsEntriesFirst.has_value() ||
           Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          // Keep the original scalar if the number of externally used
          // instructions in the same entry is not a power of 2. It may help to
          // do some extra vectorization for now.
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
          auto It = ValueToExtUses->find(V);
          if (It != ValueToExtUses->end()) {
            // Replace all uses to avoid compiler crash.
            ExternalUses[It->second].User = nullptr;
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            // Update the users of the operands of the cast operand to avoid
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
              auto It = ValueToExtUses->find(V);
              if (It != ValueToExtUses->end()) {
                // Replace all uses to avoid compiler crash.
                ExternalUses[It->second].User = nullptr;
    ExtractCost += ExtraCost;
  // Insert externals for extract of operands of casts to be emitted as scalars
  // instead of extractelement.
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));

  // Add reduced value cost, if resized.
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
      ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {

  Cost += SpillCost + ExtractCost;
  unsigned VF = Mask.size();
  unsigned VecVF = TE->getVectorFactor();
  (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
  std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
  dbgs() << "SLP: Adding cost " << C
         << " for final shuffle of insertelement external users.\n";
  TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
  return std::make_pair(TE, true);
  return std::make_pair(TE, false);

  // Calculate the cost of the reshuffled vectors, if any.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    assert((TEs.size() == 1 || TEs.size() == 2) &&
           "Expected exactly 1 or 2 tree entries.");
    if (TEs.size() == 1) {
      VF = TEs.front()->getVectorFactor();
      auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
      (Data.index() < VF &&
       static_cast<int>(Data.index()) == Data.value());
      << " for final shuffle of insertelement "
         "external users.\n";
      TEs.front()->dump();
      dbgs() << "SLP: Current total cost = " << Cost << "\n");
      TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
        VF = TEs.front()->getVectorFactor();
      auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
      << " for final shuffle of vector node and external "
         "insertelement users.\n";
      if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
      dbgs() << "SLP: Current total cost = " << Cost << "\n");
    (void)performExtractsShuffleAction<const TreeEntry>(
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
    cast<FixedVectorType>(
        ShuffledInserts[I].InsertElements.front()->getType()),
  // Add the cost for reduced value resize (if required).
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
          Instruction::BitCast; // Handle it by getExtendedReductionCost.
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      if (Opcode != Instruction::BitCast) {
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
        << " for final resize for reduction from " << SrcVecTy
        << " to " << DstVecTy << "\n";
        dbgs() << "SLP: Current total cost = " << Cost << "\n");

  OS << "SLP: Spill Cost = " << SpillCost << ".\n"
     << "SLP: Extract Cost = " << ExtractCost << ".\n"
     << "SLP: Total Cost = " << Cost << ".\n";

  ViewGraph(this, "SLP" + F->getName(), false, Str);
/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  // Scan list of gathered scalars for extractelements that can be represented
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (isa<UndefValue>(VL[I]))
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
    ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  // Sort the vector operands by the maximum number of uses in extractelements.
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  // Find the best pair of the vectors or a single vector.
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  // Check if better to perform a shuffle of 2 vectors or just of a single
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
  // Add extracts from undefs too.
  for (int Idx : UndefVectorExtracts)
  // Check that gather of extractelements can be represented as just a
  // shuffle of a single/two vectors the scalars are extracted from.
  std::optional<TTI::ShuffleKind> Res =
  // TODO: try to check other subsets if possible.
  // Restore the original VL if attempt was not successful.
    return std::nullopt;
  // Restore unused scalars from mask, if some of the extractelements were not
  // selected for shuffle.
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    isa<UndefValue>(GatheredExtracts[I])) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||

/// Tries to find extractelement instructions with constant indices from fixed
/// vector type and gather such instructions into a bunch, which highly likely
/// might be detected as a shuffle of 1 or 2 input vectors. If this attempt was
/// successful, the matched scalars are replaced by poison values in \p VL for
/// future analysis.
    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  for (unsigned Part : seq<unsigned>(NumParts)) {
    // Scan list of gathered scalars for extractelements that can be represented
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
    ShufflesRes.clear();
13089BoUpSLP::isGatherShuffledSingleRegisterEntry(
13093// TODO: currently checking only for Scalars in the tree entry, need to count 13094// reused elements too for better cost estimation. 13095const EdgeInfo &TEUseEI =
TE == VectorizableTree.front().get()
13096 ? EdgeInfo(
const_cast<TreeEntry *
>(TE), 0)
13097 :
TE->UserTreeIndices.front();
13098constInstruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13100// Main node of PHI entries keeps the correct order of operands/incoming 13102if (
auto *
PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13103 TEInsertBlock =
PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13106 TEInsertBlock = TEInsertPt->
getParent();
13109return std::nullopt;
13110auto *NodeUI = DT->
getNode(TEInsertBlock);
13111assert(NodeUI &&
"Should only process reachable instructions");
13113auto CheckOrdering = [&](
constInstruction *InsertPt) {
13114// Argument InsertPt is an instruction where vector code for some other 13115// tree entry (one that shares one or more scalars with TE) is going to be 13116// generated. This lambda returns true if insertion point of vector code 13117// for the TE dominates that point (otherwise dependency is the other way 13118// around). The other node is not limited to be of a gather kind. Gather 13119// nodes are not scheduled and their vector code is inserted before their 13120// first user. If user is PHI, that is supposed to be at the end of a 13121// predecessor block. Otherwise it is the last instruction among scalars of 13122// the user node. So, instead of checking dependency between instructions 13123// themselves, we check dependency between their insertion points for vector 13124// code (since each scalar instruction ends up as a lane of a vector 13127auto *NodeEUI = DT->
getNode(InsertBlock);
13130assert((NodeUI == NodeEUI) ==
13131 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13132"Different nodes should have different DFS numbers");
13133// Check the order of the gather nodes users. 13134if (TEInsertPt->
getParent() != InsertBlock &&
13137if (TEInsertPt->
getParent() == InsertBlock &&
13142// Find all tree entries used by the gathered values. If no common entries 13143// found - not a shuffle. 13144// Here we build a set of tree nodes for each gathered value and trying to 13145// find the intersection between these sets. If we have at least one common 13146// tree node for each gathered value - we have just a permutation of the 13147// single vector. If we have 2 different sets, we're in situation where we 13148// have a permutation of 2 input vectors. 13151for (
Value *V : VL) {
13154// Build a list of tree entries where V is used. 13156for (
const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13157if (TEPtr == TE || TEPtr->Idx == 0)
13160 [&](
Value *V) { return GatheredScalars.contains(V); }) &&
13161"Must contain at least single gathered value.");
13162assert(TEPtr->UserTreeIndices.size() == 1 &&
13163"Expected only single user of a gather node.");
13164const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13166PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13169 : &getLastInstructionInBundle(UseEI.UserTE);
13170if (TEInsertPt == InsertPt) {
13171// If 2 gathers are operands of the same entry (regardless of whether 13172// user is PHI or else), compare operands indices, use the earlier one 13174if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13176// If the user instruction is used for some reason in different 13177// vectorized nodes - make it depend on index. 13178if (TEUseEI.UserTE != UseEI.UserTE &&
13179 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13183// Check if the user node of the TE comes after user node of TEPtr, 13184// otherwise TEPtr depends on TE. 13185if ((TEInsertBlock != InsertPt->
getParent() ||
13186 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13187 !CheckOrdering(InsertPt))
13191if (
const TreeEntry *VTE = getTreeEntry(V)) {
13192if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13193if (VTE->State != TreeEntry::Vectorize) {
13194auto It = MultiNodeScalars.
find(V);
13195if (It == MultiNodeScalars.
end())
13197 VTE = *It->getSecond().begin();
13198// Iterate through all vectorized nodes. 13199auto *MIt =
find_if(It->getSecond(), [](
const TreeEntry *MTE) {
13200 return MTE->State == TreeEntry::Vectorize;
13202if (MIt == It->getSecond().end())
13207if (
none_of(
TE->CombinedEntriesWithIndices,
13208 [&](
constauto &
P) { return P.first == VTE->Idx; })) {
13209Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13210if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
    if (UsedTEs.empty()) {
      // The first iteration, just insert the list of nodes to vector.
      // Need to check if there are any previously used tree nodes which use V.
      // If there are no such nodes, consider that we have another one input
      // Do we have a non-empty intersection of previously listed tree entries
      // and tree entries using current V?
      if (!VToTEs.empty()) {
        // Yes, write the new subset and continue analysis for the next
        VToTEs = SavedVToTEs;
      // No non-empty intersection found - need to add a second set of possible
      // If the number of input vectors is greater than 2 - not a permutation,
      // fall back to the regular gather.
      // TODO: support multiple reshuffled nodes.
      if (UsedTEs.size() == 2)
      UsedTEs.push_back(SavedVToTEs);

  if (UsedTEs.empty()) {
    return std::nullopt;

  if (UsedTEs.size() == 1) {
    // Keep the order to avoid non-determinism.
    UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    // Try to find the perfect match in another gather node at first.
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      // Clear undef scalars.
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
    // No perfect match, just shuffle, so choose the first tree node from the
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
    // Try to find nodes with the same vector factor.
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    // Keep the order of tree nodes to avoid non-determinism.
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
    // Same, keep the order to avoid non-determinism.
    UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        Entries.push_back(It->second);
        Entries.push_back(TE);
    // No 2 source vectors with the same vector factor - just choose 2 with max
    if (Entries.empty()) {
      UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
        return TE1->Idx < TE2->Idx;
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
      VF = Entries.front()->getVectorFactor();

  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  // Checks if the 2 PHIs are compatible in terms of high possibility to be
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    // Check that all incoming values are compatible/from same parent (if they
    // are instructions).
    // The incoming values are compatible if they all are constants, or
    // instructions with the same/alternate opcodes from the same basic block.
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
      if (cast<Instruction>(In)->getParent() !=
  // Check if the value can be ignored during analysis for shuffled gathers.
  // We suppose it is better to ignore instructions which do not form splats,
  // are not vectorized/not extractelements (these instructions will be handled
  // by extractelements processing) or may form a vector node in the future.
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  // Check that the neighbor instruction may form a full vector node with the
  // current instruction V. It is possible, if they have same/alternate opcode
  // and same parent basic block.
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  // Build a shuffle mask for better cost estimation and vector emission.
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
    // Do not try to shuffle scalars, if they are constants, or instructions
    // that can be vectorized as a result of the following vector build
    ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
     (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
    unsigned Idx = It->second;
  // Iterate through all shuffled scalars and select entries, which can be used
  // for final shuffle.
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
    // Fix the entry number for the given scalar. If it is the first entry, set
    // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
    // These indices are used when calculating final shuffle mask as the vector
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      Pair.first = TempEntries.size();
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
      .slice(Part * VL.size(),
             std::min<int>(VL.size(), TE->Scalars.size())))) {
    // We may have here 1 or 2 entries only. If the number of scalars is equal
    // to the number of entries, no need to do the analysis, it is not very
    // profitable. Since VL is not the same as TE->Scalars, it means we already
    // have some shuffles before. Cut off the not profitable case.
    return std::nullopt;
  // Build the final mask, check for the identity shuffle, if possible.
  bool IsIdentity = Entries.size() == 1;
  // Pair.first is the offset to the vector, while Pair.second is the index of
  // scalar in the list.
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
    (ForOrder ? std::distance(
                    Entries[Pair.first]->Scalars.begin(),
                    find(Entries[Pair.first]->Scalars, VL[Pair.second]))
              : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      if (EntryLanes.size() > 2 || VL.size() <= 2)
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
    // Do the cost estimation if the shuffle is more beneficial than a
    // buildvector.
    std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
        (MaxElement % VF) - (MinElement % VF) + 1));
    Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
          (Idx >= static_cast<int>(VF) ? NewVF : 0);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
    if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
        Mask, Entries.front()->getInterleaveFactor()))
      return ::getShuffleCost(TTI,
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
      // Transform mask to include only the first entry.
      bool IsIdentity = true;
      if (Idx >= static_cast<int>(NewVF)) {
      IsIdentity &= static_cast<int>(I) == Idx;
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
      MaskVecTy, DemandedElts, /*Insert=*/true,
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
      // Transform mask to include only the second entry.
      bool IsIdentity = true;
      if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
      IsIdentity &= static_cast<int>(I) == Idx;
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
      MaskVecTy, DemandedElts, /*Insert=*/true,
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    if (Idx >= static_cast<int>(VF))
                      Idx = PoisonMaskElem;
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    if (Idx < static_cast<int>(VF))
                      Idx = PoisonMaskElem;
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    if (BuildVectorCost >= ShuffleCost) {
      Entries.push_back(BestEntry);
  // Clear the corresponding mask elements.
  std::fill(std::next(Mask.begin(), Part * VL.size()),
  return std::nullopt;
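// Note on the function above (illustrative summary): for a single register of
// gathered scalars it tries to find one or two existing tree entries covering
// those scalars, builds the corresponding shuffle mask, and falls back to
// std::nullopt when a plain buildvector is estimated to be no more expensive
// than the shuffle.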
BoUpSLP::isGatherShuffledEntry(
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  // No need to check for the topmost gather node.
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
       [](const std::unique_ptr<TreeEntry> &TE) {
         return !TE->isGather();
  // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
  "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
    (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
    "Expected splat or extractelements only node.");
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
      SubEntries.clear();
      SubEntries.front()->getVectorFactor() == VL.size() &&
      (SubEntries.front()->isSame(TE->Scalars) ||
       SubEntries.front()->isSame(VL))) {
      LocalSubEntries.swap(SubEntries);
      std::iota(Mask.begin(), Mask.end(), 0);
      // Clear undef scalars.
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
      Entries.emplace_back(1, LocalSubEntries.front());
  [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
                                       Type *ScalarTy) const {
  auto *VecTy = getWidenedType(ScalarTy, VL.size());
  bool DuplicateNonConst = false;
  // Find the cost of inserting/extracting values from the vector.
  // Check if the same elements are inserted several times and count them as
  // shuffle candidates.
  APInt ShuffledElements = APInt::getZero(VL.size());
  DenseMap<Value *, unsigned> UniqueElements;
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost;
  auto EstimateInsertCost = [&](unsigned I, Value *V) {
    if (V->getType() != ScalarTy) {
      Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
                                    TTI::CastContextHint::None, CostKind);
      V = nullptr;
    }
    if (!ForPoisonSrc)
      Cost +=
          TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                  I, Constant::getNullValue(VecTy), V);
  };
  SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    Value *V = VL[I];
    // No need to shuffle duplicates for constants.
    if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
      ShuffledElements.setBit(I);
      ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
      continue;
    }
    auto Res = UniqueElements.try_emplace(V, I);
    if (Res.second) {
      EstimateInsertCost(I, V);
      ShuffleMask[I] = I;
      continue;
    }
    DuplicateNonConst = true;
    ShuffledElements.setBit(I);
    ShuffleMask[I] = Res.first->second;
  }
  if (ForPoisonSrc) {
    if (isa<FixedVectorType>(ScalarTy)) {
      // We don't need to insert elements one by one. Instead, we can insert
      // the entire vector into the destination.
      Cost = 0;
      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
      for (unsigned I : seq<unsigned>(VL.size()))
        if (!ShuffledElements[I])
          Cost += ::getShuffleCost(
              *TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
    } else {
      Cost = TTI->getScalarizationOverhead(VecTy,
                                           /*DemandedElts*/ ~ShuffledElements,
                                           /*Insert*/ true,
                                           /*Extract*/ false, CostKind, VL);
    }
  }
  if (DuplicateNonConst)
    Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                             VecTy, ShuffleMask);
  return Cost;
}
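// Illustrative example (assumed values): gathering VL = {%a, %b, %a, %c} only
// pays insertelement costs for the unique scalars %a, %b and %c; the repeated
// %a is recorded in ShuffleMask as {0, 1, 0, 3}, and a single
// SK_PermuteSingleSrc shuffle cost is added for the duplication instead of a
// fourth insert.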
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  if (Res)
    return *Res;
  // Get the basic block this bundle is in. All instructions in the bundle
  // should be in this block (except for extractelement-like instructions with
  // constant indices or gathered loads).
  auto *Front = E->getMainOp();
  auto *BB = Front->getParent();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
          all_of(E->Scalars,
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                     return true;
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
         "block.");

  auto FindLastInst = [&]() {
13772for (
Value *V : E->Scalars) {
13773auto *
I = dyn_cast<Instruction>(V);
13776if (LastInst->
getParent() ==
I->getParent()) {
13781assert(((E->getOpcode() == Instruction::GetElementPtr &&
13782 !isa<GetElementPtrInst>(
I)) ||
13785 (GatheredLoadsEntriesFirst.has_value() &&
13786 E->getOpcode() == Instruction::Load && E->isGather() &&
13787 E->Idx < *GatheredLoadsEntriesFirst)) &&
13788"Expected vector-like or non-GEP in GEP node insts only.");
13796auto *NodeB = DT->
getNode(
I->getParent());
13797assert(NodeA &&
"Should only process reachable instructions");
13798assert(NodeB &&
"Should only process reachable instructions");
13799assert((NodeA == NodeB) ==
13800 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13801"Different nodes should have different DFS numbers");
13802if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13809auto FindFirstInst = [&]() {
13811for (
Value *V : E->Scalars) {
13812auto *
I = dyn_cast<Instruction>(V);
13815if (FirstInst->
getParent() ==
I->getParent()) {
13816if (
I->comesBefore(FirstInst))
13820assert(((E->getOpcode() == Instruction::GetElementPtr &&
13821 !isa<GetElementPtrInst>(
I)) ||
13824"Expected vector-like or non-GEP in GEP node insts only.");
13832auto *NodeB = DT->
getNode(
I->getParent());
13833assert(NodeA &&
"Should only process reachable instructions");
13834assert(NodeB &&
"Should only process reachable instructions");
13835assert((NodeA == NodeB) ==
13836 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13837"Different nodes should have different DFS numbers");
13838if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13844// Set insertpoint for gathered loads to the very first load. 13845if (GatheredLoadsEntriesFirst.has_value() &&
13846 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13847 E->getOpcode() == Instruction::Load) {
13848 Res = FindFirstInst();
13852// Set the insert point to the beginning of the basic block if the entry 13853// should not be scheduled. 13856if ((E->getOpcode() == Instruction::GetElementPtr &&
13859 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13863 return isa<PoisonValue>(V) ||
13864 (!isVectorLikeInstWithConstOps(V) &&
13865 isUsedOutsideBlock(V));
13867 (E->isGather() && E->Idx == 0 &&
all_of(E->Scalars, [](
Value *V) {
13868 return isa<ExtractElementInst, UndefValue>(V) ||
13869 areAllOperandsNonInsts(V);
13871 Res = FindLastInst();
13873 Res = FindFirstInst();
13877// Find the last instruction. The common case should be that BB has been 13878// scheduled, and the last instruction is VL.back(). So we start with 13879// VL.back() and iterate over schedule data until we reach the end of the 13880// bundle. The end of the bundle is marked by null ScheduleData. 13881if (BlocksSchedules.count(BB) && !E->isGather()) {
13882Value *
V = E->isOneOf(E->Scalars.back());
13885auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13886if (Bundle && Bundle->isPartOfBundle())
13887for (; Bundle; Bundle = Bundle->NextInBundle)
13888 Res = Bundle->Inst;
13891// LastInst can still be null at this point if there's either not an entry 13892// for BB in BlocksSchedules or there's no ScheduleData available for 13893// VL.back(). This can be the case if buildTree_rec aborts for various 13894// reasons (e.g., the maximum recursion depth is reached, the maximum region 13895// size is reached, etc.). ScheduleData is initialized in the scheduling 13898// If this happens, we can still find the last instruction by brute force. We 13899// iterate forwards from Front (inclusive) until we either see all 13900// instructions in the bundle or reach the end of the block. If Front is the 13901// last instruction in program order, LastInst will be set to Front, and we 13902// will visit all the remaining instructions in the block. 13904// One of the reasons we exit early from buildTree_rec is to place an upper 13905// bound on compile-time. Thus, taking an additional compile-time hit here is 13906// not ideal. However, this should be exceedingly rare since it requires that 13907// we both exit early from buildTree_rec and that the bundle be out-of-order 13908// (causing us to iterate all the way to the end of the block). 13910 Res = FindLastInst();
13911assert(Res &&
"Failed to find last instruction in bundle");
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  BasicBlock::iterator LastInstIt = LastInst->getIterator();
  // If the instruction is PHI, set the insert point after all the PHIs.
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
  if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
  } else {
    // Set the insertion point after the last instruction in the bundle. Set
    // the debug location to Front.
    Builder.SetInsertPoint(LastInst->getParent(),
                           std::next(LastInst->getIterator()));
  }
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
}
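// Illustrative example (hypothetical bundle, for exposition only): for a
// bundle {%a = add i32 ..., %b = add i32 ...} whose last member in program
// order is %b, the IR builder ends up positioned immediately after %b; if the
// bundle consists of PHIs, the insertion point is instead moved past the whole
// PHI block so the generated vector code stays structurally valid.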
Value *BoUpSLP::gather(
    ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
    function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
  // List of instructions/lanes from current block and/or the blocks which are
  // part of the current loop. These instructions will be inserted at the end to
  // make it possible to optimize loops and hoist invariant instructions out of
  // the loops body with better chances for success.
  SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
  SmallSet<int, 4> PostponedIndices;
  Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
  auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
    SmallPtrSet<BasicBlock *, 4> Visited;
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  };
13952for (
intI = 0, E = VL.
size();
I < E; ++
I) {
13953if (
auto *Inst = dyn_cast<Instruction>(VL[
I]))
13954if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13955 getTreeEntry(Inst) ||
13956 (L && (!Root ||
L->isLoopInvariant(Root)) &&
L->contains(Inst))) &&
13957 PostponedIndices.
insert(
I).second)
13961auto &&CreateInsertElement = [
this](
Value *Vec,
Value *
V,
unsigned Pos,
13964if (
Scalar->getType() != Ty) {
13968if (
auto *CI = dyn_cast<CastInst>(Scalar);
13969 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13971if (
auto *IOp = dyn_cast<Instruction>(
Op);
13972 !IOp || !(
isDeleted(IOp) || getTreeEntry(IOp)))
13975Scalar = Builder.CreateIntCast(
13980if (
auto *VecTy = dyn_cast<FixedVectorType>(
Scalar->getType())) {
13984auto *
II = dyn_cast<IntrinsicInst>(Vec);
13985if (!
II ||
II->getIntrinsicID() != Intrinsic::vector_insert)
13989 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13990 InsElt = dyn_cast<InsertElementInst>(Vec);
13994 GatherShuffleExtractSeq.
insert(InsElt);
13996// Add to our 'need-to-extract' list. 13997if (isa<Instruction>(V)) {
13998if (TreeEntry *Entry = getTreeEntry(V)) {
13999// Find which lane we need to extract. 14000User *UserOp =
nullptr;
14002if (
auto *SI = dyn_cast<Instruction>(Scalar))
14008unsigned FoundLane =
Entry->findLaneForValue(V);
14009 ExternalUses.emplace_back(V, UserOp, FoundLane);
14019 std::iota(
Mask.begin(),
Mask.end(), 0);
14020Value *OriginalRoot = Root;
14021if (
auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14022 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14023 SV->getOperand(0)->getType() == VecTy) {
14024 Root = SV->getOperand(0);
14025Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14027// Insert constant values at first. 14028for (
intI = 0, E = VL.
size();
I < E; ++
I) {
14035if (isa<PoisonValue>(VL[
I]))
14037 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
14041if (isa<PoisonValue>(Vec)) {
14042 Vec = OriginalRoot;
14044 Vec = CreateShuffle(Root, Vec, Mask);
14045if (
auto *OI = dyn_cast<Instruction>(OriginalRoot);
14046 OI && OI->hasNUses(0) &&
14047none_of(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14048returnTE->VectorizedValue == OI;
14053// Insert non-constant values. 14054for (
intI : NonConsts)
14055 Vec = CreateInsertElement(Vec, VL[
I],
I, ScalarTy);
14056// Append instructions, which are/may be part of the loop, in the end to make 14057// it possible to hoist non-loop-based instructions. 14058for (
const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14059 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
/// Merges shuffle masks and emits final shuffle instruction, if required. It
/// supports shuffling of 2 input vectors. It implements lazy shuffles emission,
/// when the actual shuffle instruction is generated only if this is actually
/// required. Otherwise, the shuffle instruction emission is delayed till the
/// end of the process, to reduce the number of emitted instructions and further
/// analysis/transformations.
/// The class also will look through the previously emitted shuffle instructions
/// and properly mark indices in mask as undef.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
/// If 2 operands are of different size, the smallest one will be resized and
/// the mask recalculated properly.
/// For example, given the code
/// \code
/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
/// \endcode
/// and if need to emit shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
/// look through %s1 and %s2 and emit
/// \code
/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
/// \endcode
/// instead.
class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  /// Combined mask for all applied operands and masks. It is built during
  /// analysis and actual emission of shuffle vector instructions.
  SmallVector<int> CommonMask;
  /// List of operands for the shuffle vector instruction. It holds at most 2
  /// operands; if the 3rd is going to be added, the first 2 are combined into
  /// a shuffle with \p CommonMask mask, the first operand is set to be the
  /// resulting shuffle and the second operand is set to be the newly added
  /// operand. The \p CommonMask is transformed in the proper way after that.
  SmallVector<Value *, 2> InVectors;
  IRBuilderBase &Builder;
  BoUpSLP &R;

  class ShuffleIRBuilder {
14112 /// Holds all of the instructions that we gathered. 14114 /// A list of blocks that we are going to CSE. 14123 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14124 CSEBlocks(CSEBlocks),
DL(
DL) {}
14125 ~ShuffleIRBuilder() =
default;
14126 /// Creates shufflevector for the 2 operands with the given mask. 14128if (V1->
getType() != V2->getType()) {
14131"Expected integer vector types only.");
14132if (V1->
getType() != V2->getType()) {
14133if (cast<VectorType>(V2->getType())
14135 ->getIntegerBitWidth() < cast<VectorType>(V1->
getType())
14137 ->getIntegerBitWidth())
14146if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14147 GatherShuffleExtractSeq.
insert(
I);
14148 CSEBlocks.
insert(
I->getParent());
14152 /// Creates permutation of the single vector operand with the given mask, if 14153 /// it is not identity mask. 14157unsigned VF = Mask.size();
14158unsigned LocalVF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14162if (
auto *
I = dyn_cast<Instruction>(Vec)) {
14163 GatherShuffleExtractSeq.
insert(
I);
14164 CSEBlocks.
insert(
I->getParent());
14169Value *createPoison(
Type *Ty,
unsigned VF) {
14172 /// Resizes 2 input vector to match the sizes, if the they are not equal 14173 /// yet. The smallest vector is resized to the size of the larger vector. 14175if (V1->
getType() == V2->getType())
14177int V1VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
14178int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14179int VF = std::max(V1VF, V2VF);
14180int MinVF = std::min(V1VF, V2VF);
14182 std::iota(IdentityMask.
begin(), std::next(IdentityMask.
begin(), MinVF),
14184Value *&
Op = MinVF == V1VF ? V1 : V2;
14186if (
auto *
I = dyn_cast<Instruction>(
Op)) {
14187 GatherShuffleExtractSeq.
insert(
I);
14188 CSEBlocks.
insert(
I->getParent());
14197 /// Smart shuffle instruction emission, walks through shuffles trees and 14198 /// tries to find the best matching vector for the actual shuffle 14201assert(V1 &&
"Expected at least one vector value.");
14202 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14203 R.CSEBlocks, *R.DL);
14204return BaseShuffleAnalysis::createShuffle<Value *>(
14205 V1, V2, Mask, ShuffleBuilder, ScalarTy);
14208 /// Cast value \p V to the vector type with the same number of elements, but 14209 /// the base type \p ScalarTy. 14211 std::optional<bool> IsSigned = std::nullopt) {
14212auto *VecTy = cast<VectorType>(V->getType());
14223 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14225 /// Adjusts extractelements after reusing them. 14227ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14228unsigned NumParts,
bool &UseVecBaseAsInput) {
14229 UseVecBaseAsInput =
false;
14231Value *VecBase =
nullptr;
14233if (!E->ReorderIndices.empty()) {
14235 E->ReorderIndices.end());
14238for (
intI = 0, Sz = Mask.size();
I < Sz; ++
I) {
14242auto *EI = cast<ExtractElementInst>(VL[
I]);
14243 VecBase = EI->getVectorOperand();
14244if (
const TreeEntry *TE = R.getTreeEntry(VecBase))
14245 VecBase = TE->VectorizedValue;
14246assert(VecBase &&
"Expected vectorized value.");
14247 UniqueBases.
insert(VecBase);
14248// If the only one use is vectorized - can delete the extractelement 14250if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14251 (NumParts != 1 &&
count(VL, EI) > 1) ||
14253 const TreeEntry *UTE = R.getTreeEntry(U);
14254 return !UTE || R.MultiNodeScalars.contains(U) ||
14255 (isa<GetElementPtrInst>(U) &&
14256 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14257 count_if(R.VectorizableTree,
14258 [&](const std::unique_ptr<TreeEntry> &TE) {
14259 return any_of(TE->UserTreeIndices,
14260 [&](const EdgeInfo &Edge) {
14261 return Edge.UserTE == UTE;
14263 is_contained(VL, EI);
14267 R.eraseInstruction(EI);
14269if (NumParts == 1 || UniqueBases.
size() == 1) {
14270assert(VecBase &&
"Expected vectorized value.");
14271return castToScalarTyElem(VecBase);
14273 UseVecBaseAsInput =
true;
14279// Perform multi-register vector shuffle, joining them into a single virtual 14281// Need to shuffle each part independently and then insert all this parts 14282// into a long virtual vector register, forming the original vector. 14283Value *Vec =
nullptr;
14286for (
unsigned Part : seq<unsigned>(NumParts)) {
14290constexprint MaxBases = 2;
14292auto VLMask =
zip(SubVL, SubMask);
14293constunsigned VF = std::accumulate(
14294 VLMask.begin(), VLMask.end(), 0U, [&](
unsigned S,
constauto &
D) {
14295 if (std::get<1>(D) == PoisonMaskElem)
14298 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14299 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14300 VecOp = TE->VectorizedValue;
14301 assert(VecOp &&
"Expected vectorized value.");
14302 const unsigned Size =
14303 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14304 return std::max(S, Size);
14306for (
constauto [V,
I] : VLMask) {
14309Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14310if (
const TreeEntry *TE = R.getTreeEntry(VecOp))
14311 VecOp = TE->VectorizedValue;
14312assert(VecOp &&
"Expected vectorized value.");
14313 VecOp = castToScalarTyElem(VecOp);
14314 Bases[
I / VF] = VecOp;
14320 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14321 TransformToIdentity(SubMask);
14323 SubVec = Bases.front();
14330Mask.slice(
P * SliceSize,
14337"Expected first part or all previous parts masked.");
14338copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14341 cast<FixedVectorType>(Vec->
getType())->getNumElements();
14344 cast<FixedVectorType>(SubVec->
getType())->getNumElements();
14345 NewVF = std::max(NewVF, SubVecVF);
14348for (
int &
Idx : SubMask)
14351copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14352 Vec = createShuffle(Vec, SubVec, VecMask);
14353 TransformToIdentity(VecMask);
14359 /// Checks if the specified entry \p E needs to be delayed because of its 14360 /// dependency nodes. 14361 std::optional<Value *>
14364// No need to delay emission if all deps are ready. 14367 TEs, [](
const TreeEntry *TE) {
return TE->VectorizedValue; });
14369return std::nullopt;
14370// Postpone gather emission, will be emitted after the end of the 14371// process to keep correct order. 14378 /// Adds 2 input vectors (in form of tree entries) and the mask for their 14381Value *V1 = E1.VectorizedValue;
14383 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14384 if (isa<PoisonValue>(V))
14386 return !isKnownNonNegative(
14387 V, SimplifyQuery(*R.DL));
14389Value *V2 = E2.VectorizedValue;
14390if (V2->getType()->isIntOrIntVectorTy())
14391 V2 = castToScalarTyElem(V2,
any_of(E2.Scalars, [&](
Value *V) {
14392 if (isa<PoisonValue>(V))
14394 return !isKnownNonNegative(
14395 V, SimplifyQuery(*R.DL));
14399 /// Adds single input vector (in form of tree entry) and the mask for its 14402Value *V1 = E1.VectorizedValue;
14404 V1 = castToScalarTyElem(V1,
any_of(E1.Scalars, [&](
Value *V) {
14405 if (isa<PoisonValue>(V))
14407 return !isKnownNonNegative(
14408 V, SimplifyQuery(*R.DL));
14412 /// Adds 2 input vectors and the mask for their shuffling. 14414assert(V1 && V2 && !Mask.empty() &&
"Expected non-empty input vectors.");
14416 isa<FixedVectorType>(V2->getType()) &&
14417"castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14418 V1 = castToScalarTyElem(V1);
14419 V2 = castToScalarTyElem(V2);
14420if (InVectors.
empty()) {
14423 CommonMask.
assign(Mask.begin(), Mask.end());
14427if (InVectors.
size() == 2) {
14428 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14429 transformMaskAfterShuffle(CommonMask, CommonMask);
14430 }
elseif (cast<FixedVectorType>(Vec->
getType())->getNumElements() !=
14432 Vec = createShuffle(Vec,
nullptr, CommonMask);
14433 transformMaskAfterShuffle(CommonMask, CommonMask);
14435 V1 = createShuffle(V1, V2, Mask);
14436unsigned VF = std::max(getVF(V1), getVF(Vec));
14437for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14439 CommonMask[
Idx] =
Idx + VF;
14440 InVectors.
front() = Vec;
14441if (InVectors.
size() == 2)
14442 InVectors.
back() = V1;
14446 /// Adds another one input vector and the mask for the shuffling. 14449"castToScalarTyElem expects V1 to be FixedVectorType");
14450 V1 = castToScalarTyElem(V1);
14451if (InVectors.
empty()) {
14453 CommonMask.
assign(Mask.begin(), Mask.end());
14456constauto *It =
find(InVectors, V1);
14457if (It == InVectors.
end()) {
14458if (InVectors.
size() == 2 ||
14461if (InVectors.
size() == 2) {
14462 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14463 transformMaskAfterShuffle(CommonMask, CommonMask);
14464 }
elseif (cast<FixedVectorType>(V->getType())->getNumElements() !=
14465 CommonMask.
size()) {
14466 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14467 transformMaskAfterShuffle(CommonMask, CommonMask);
14469unsigned VF = std::max(CommonMask.
size(), Mask.size());
14470for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14473 V->getType() != V1->
getType()
14475 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14476 ->getNumElements();
14477if (V->getType() != V1->
getType())
14478 V1 = createShuffle(V1,
nullptr, Mask);
14479 InVectors.
front() = V;
14480if (InVectors.
size() == 2)
14481 InVectors.
back() = V1;
14486// Check if second vector is required if the used elements are already 14487// used from the first one. 14488for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14495for (
Value *V : InVectors)
14496 VF = std::max(VF, getVF(V));
14497for (
unsignedIdx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14499 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
14501 /// Adds another one input vector and the mask for the shuffling. 14508Value *Root =
nullptr) {
14509return R.gather(VL, Root, ScalarTy,
14511return createShuffle(V1, V2, Mask);
14515 /// Finalize emission of the shuffles. 14516 /// \param Action the action (if any) to be performed before final applying of 14517 /// the \p ExtMask mask. 14520ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14526if (InVectors.
size() == 2) {
14527 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14530 Vec = createShuffle(Vec,
nullptr, CommonMask);
14532 transformMaskAfterShuffle(CommonMask, CommonMask);
14534"Expected vector length for the final value before action.");
14535unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
14538 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14539 Vec = createShuffle(Vec,
nullptr, ResizeMask);
14541 Action(Vec, CommonMask);
14542 InVectors.
front() = Vec;
14544if (!SubVectors.empty()) {
14546if (InVectors.
size() == 2) {
14547 Vec = createShuffle(Vec, InVectors.
back(), CommonMask);
14550 Vec = createShuffle(Vec,
nullptr, CommonMask);
14552 transformMaskAfterShuffle(CommonMask, CommonMask);
14553auto CreateSubVectors = [&](
Value *Vec,
14555for (
auto [E,
Idx] : SubVectors) {
14556Value *
V = E->VectorizedValue;
14557if (
V->getType()->isIntOrIntVectorTy())
14558 V = castToScalarTyElem(V,
any_of(E->Scalars, [&](
Value *V) {
14559 if (isa<PoisonValue>(V))
14561 return !isKnownNonNegative(
14562 V, SimplifyQuery(*R.DL));
14566 Builder, Vec, V, InsertionIndex,
14567 std::bind(&ShuffleInstructionBuilder::createShuffle,
this, _1, _2,
14569if (!CommonMask.
empty()) {
14570 std::iota(std::next(CommonMask.
begin(),
Idx),
14571 std::next(CommonMask.
begin(),
Idx + E->getVectorFactor()),
14577if (SubVectorsMask.
empty()) {
14578 Vec = CreateSubVectors(Vec, CommonMask);
14581copy(SubVectorsMask, SVMask.begin());
14582for (
auto [I1, I2] :
zip(SVMask, CommonMask)) {
14585I1 = I2 + CommonMask.
size();
14590 Vec = createShuffle(InsertVec, Vec, SVMask);
14591 transformMaskAfterShuffle(CommonMask, SVMask);
14593 InVectors.
front() = Vec;
14596if (!ExtMask.
empty()) {
14597if (CommonMask.
empty()) {
14601for (
intI = 0, Sz = ExtMask.
size();
I < Sz; ++
I) {
14604 NewMask[
I] = CommonMask[ExtMask[
I]];
14606 CommonMask.
swap(NewMask);
14609if (CommonMask.
empty()) {
14610assert(InVectors.
size() == 1 &&
"Expected only one vector with no mask");
14611return InVectors.
front();
14613if (InVectors.
size() == 2)
14614return createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14669// V may be affected by MinBWs. 14670// We want ShuffleInstructionBuilder to correctly support REVEC. The key 14671// factor is the number of elements, not their type. 14672Type *ScalarTy = cast<VectorType>(
V->getType())->getElementType();
14674 ShuffleInstructionBuilder ShuffleBuilder(
14678 ShuffleBuilder.add(V, Mask);
14680 E->CombinedEntriesWithIndices.size());
14681transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14682 [&](
constauto &
P) {
14683 return std::make_pair(VectorizableTree[P.first].get(),
14686assert((E->CombinedEntriesWithIndices.empty() ||
14687 E->ReorderIndices.empty()) &&
14688"Expected either combined subnodes or reordering");
14689return ShuffleBuilder.finalize({}, SubVectors, {});
14693 cast<FixedVectorType>(
V->getType())->getNumElements()) {
14694if (!VE->ReuseShuffleIndices.empty()) {
14695// Reshuffle to get only unique values. 14696// If some of the scalars are duplicated in the vectorization 14697// tree entry, we do not vectorize them but instead generate a 14698// mask for the reuses. But if there are several users of the 14699// same entry, they may have different vectorization factors. 14700// This is especially important for PHI nodes. In this case, we 14701// need to adapt the resulting instruction for the user 14702// vectorization factor and have to reshuffle it again to take 14703// only unique elements of the vector. Without this code the 14704// function incorrectly returns reduced vector instruction with 14705// the same elements, not with the unique ones. 14708// %phi = phi <2 x > { .., %entry} {%shuffle, %block} 14709// %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0> 14711// %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0} 14715if (isa<PoisonValue>(V))
14717Mask[
I] = VE->findLaneForValue(V);
14719V = FinalShuffle(V, Mask);
14721assert(VF < cast<FixedVectorType>(
V->getType())->getNumElements() &&
14722"Expected vectorization factor less " 14723"than original vector size.");
14725 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14726V = FinalShuffle(V, UniformMask);
14729// Need to update the operand gather node, if actually the operand is not a 14730// vectorized node, but the buildvector/gather node, which matches one of 14731// the vectorized nodes. 14732if (
find_if(VE->UserTreeIndices, [&](
const EdgeInfo &EI) {
14733 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14734 }) == VE->UserTreeIndices.end()) {
14736find_if(VectorizableTree, [&](
const std::unique_ptr<TreeEntry> &TE) {
14737returnTE->isGather() &&
TE->UserTreeIndices.front().UserTE == E &&
14738TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14740assert(It != VectorizableTree.end() &&
"Expected gather node operand.");
14741 (*It)->VectorizedValue =
V;
14746// Find the corresponding gather entry and vectorize it. 14747// Allows to be more accurate with tree/graph transformations, checks for the 14748// correctness of the transformations in many cases. 14750 [E, NodeIdx](
const std::unique_ptr<TreeEntry> &TE) {
14751returnTE->isOperandGatherNode({E, NodeIdx});
14753assert(
I != VectorizableTree.end() &&
"Gather node is not in the graph.");
14754assert(
I->get()->UserTreeIndices.size() == 1 &&
14755"Expected only single user for the gather node.");
14756assert(
I->get()->isSame(VL) &&
"Expected same list of scalars.");
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
14770// Clear values, to be replaced by insertvector instructions. 14771for (
auto [EIdx,
Idx] : E->CombinedEntriesWithIndices)
14773 .slice(
Idx, VectorizableTree[EIdx]->getVectorFactor()),
14776 E->CombinedEntriesWithIndices.size());
14777transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14778 [&](
constauto &
P) {
14779 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14781// Build a mask out of the reorder indices and reorder scalars per this 14784 E->ReorderIndices.end());
14785if (!ReorderMask.empty())
14789// Transform non-clustered elements in the mask to poison (-1). 14790// "Clustered" operations will be reordered using this mask later. 14791if (!SubVectors.empty() && !SubVectorsMask.
empty()) {
14792for (
unsignedI : seq<unsigned>(GatheredScalars.size()))
14793if (E->Scalars[
I] == GatheredScalars[ReorderMask[
I]])
14796 SubVectorsMask.
clear();
14800unsignedI,
unsigned SliceSize,
14801bool IsNotPoisonous) {
14803 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14806 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14807unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14808if (UserTE->getNumOperands() != 2)
14810if (!IsNotPoisonous) {
14812find_if(VectorizableTree, [=](
const std::unique_ptr<TreeEntry> &TE) {
14813returnfind_if(
TE->UserTreeIndices, [=](
const EdgeInfo &EI) {
14814 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14815 }) !=
TE->UserTreeIndices.end();
14817if (It == VectorizableTree.end())
14820if (!(*It)->ReorderIndices.empty()) {
14824if (!
all_of(
zip(GatheredScalars, GS), [&](
constauto &
P) {
14825Value *V0 = std::get<0>(
P);
14826Value *V1 = std::get<1>(
P);
14827return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14828 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14834if ((
Mask.size() < InputVF &&
14837 (
Mask.size() == InputVF &&
14840 std::next(
Mask.begin(),
I * SliceSize),
14841 std::next(
Mask.begin(),
14848 std::next(
Mask.begin(),
I * SliceSize),
14849 std::next(
Mask.begin(),
14855 BVTy ShuffleBuilder(ScalarTy, Params...);
14856 ResTy Res = ResTy();
14860Value *ExtractVecBase =
nullptr;
14861bool UseVecBaseAsInput =
false;
14864Type *OrigScalarTy = GatheredScalars.front()->getType();
14867if (!
all_of(GatheredScalars, IsaPred<UndefValue>)) {
14868// Check for gathered extracts. 14869bool Resized =
false;
14871 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14872if (!ExtractShuffles.
empty()) {
14877if (
constauto *TE = getTreeEntry(
14878 cast<ExtractElementInst>(StoredGS[
Idx])->getVectorOperand()))
14881if (std::optional<ResTy> Delayed =
14882 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14883// Delay emission of gathers which are not ready yet. 14884 PostponedGathers.
insert(E);
14885// Postpone gather emission, will be emitted after the end of the 14886// process to keep correct order. 14889if (
Value *VecBase = ShuffleBuilder.adjustExtracts(
14890 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14891 ExtractVecBase = VecBase;
14892if (
auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14893if (VF == VecBaseTy->getNumElements() &&
14894 GatheredScalars.size() != VF) {
14896 GatheredScalars.append(VF - GatheredScalars.size(),
14903// Gather extracts after we check for full matched gathers only. 14904if (!ExtractShuffles.
empty() || !E->hasState() ||
14905 E->getOpcode() != Instruction::Load ||
14906 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14907any_of(E->Scalars, IsaPred<LoadInst>)) &&
14910 return isa<LoadInst>(V) && getTreeEntry(V);
14912 (E->hasState() && E->isAltShuffle()) ||
14913all_of(E->Scalars, [
this](
Value *V) { return getTreeEntry(V); }) ||
14915 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14917 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14919if (!GatherShuffles.
empty()) {
14920if (std::optional<ResTy> Delayed =
14921 ShuffleBuilder.needToDelay(E, Entries)) {
14922// Delay emission of gathers which are not ready yet. 14923 PostponedGathers.
insert(E);
14924// Postpone gather emission, will be emitted after the end of the 14925// process to keep correct order. 14928if (GatherShuffles.
size() == 1 &&
14930 Entries.front().front()->isSame(E->Scalars)) {
14931// Perfect match in the graph, will reuse the previously vectorized 14933LLVM_DEBUG(
dbgs() <<
"SLP: perfect diamond match for gather bundle " 14935// Restore the mask for previous partially matched values. 14936Mask.resize(E->Scalars.size());
14937const TreeEntry *FrontTE = Entries.front().front();
14938if (FrontTE->ReorderIndices.empty() &&
14939 ((FrontTE->ReuseShuffleIndices.empty() &&
14940 E->Scalars.size() == FrontTE->Scalars.size()) ||
14941 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14942 std::iota(
Mask.begin(),
Mask.end(), 0);
14945if (isa<PoisonValue>(V)) {
14949Mask[
I] = FrontTE->findLaneForValue(V);
14952 ShuffleBuilder.add(*FrontTE, Mask);
14953// Full matched entry found, no need to insert subvectors. 14954 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14958if (GatheredScalars.size() != VF &&
14960returnany_of(TEs, [&](
const TreeEntry *TE) {
14961returnTE->getVectorFactor() == VF;
14964 GatheredScalars.append(VF - GatheredScalars.size(),
14967// Remove shuffled elements from list of gathers. 14968for (
intI = 0, Sz =
Mask.size();
I < Sz; ++
I) {
14976bool IsRootPoison) {
14977// For splats with can emit broadcasts instead of gathers, so try to find 14979bool IsSplat = IsRootPoison &&
isSplat(Scalars) &&
14984// Gather unique non-const values and all constant values. 14985// For repeated values, just shuffle them. 14986int NumNonConsts = 0;
14989if (isa<UndefValue>(V)) {
14990if (!isa<PoisonValue>(V)) {
15005 Scalars.
front() = OrigV;
15009 Scalars[Res.first->second] = OrigV;
15010 ReuseMask[
I] = Res.first->second;
15013if (NumNonConsts == 1) {
15014// Restore single insert element. 15018if (!UndefPos.
empty() && UndefPos.
front() == 0)
15021 ReuseMask[SinglePos] = SinglePos;
15022 }
elseif (!UndefPos.
empty() && IsSplat) {
15023// For undef values, try to replace them with the simple broadcast. 15024// We can do it if the broadcasted value is guaranteed to be 15025// non-poisonous, or by freezing the incoming scalar value first. 15027return !isa<UndefValue>(V) &&
15029 (E->UserTreeIndices.size() == 1 &&
15031// Check if the value already used in the same operation in 15032// one of the nodes already. 15033 return E->UserTreeIndices.front().EdgeIdx !=
15034 U.getOperandNo() &&
15036 E->UserTreeIndices.front().UserTE->Scalars,
15040if (It != Scalars.
end()) {
15041// Replace undefs by the non-poisoned scalars and emit broadcast. 15042int Pos = std::distance(Scalars.
begin(), It);
15043for (
intI : UndefPos) {
15044// Set the undef position to the non-poisoned scalar. 15045 ReuseMask[
I] = Pos;
15046// Replace the undef by the poison, in the mask it is replaced by 15047// non-poisoned scalar already. 15052// Replace undefs by the poisons, emit broadcast and then emit 15054for (
intI : UndefPos) {
15056if (isa<UndefValue>(Scalars[
I]))
15063if (!ExtractShuffles.
empty() || !GatherShuffles.
empty()) {
15064bool IsNonPoisoned =
true;
15065bool IsUsedInExpr =
true;
15066Value *Vec1 =
nullptr;
15067if (!ExtractShuffles.
empty()) {
15068// Gather of extractelements can be represented as just a shuffle of 15069// a single/two vectors the scalars are extracted from. 15070// Find input vectors. 15071Value *Vec2 =
nullptr;
15072for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15076if (UseVecBaseAsInput) {
15077 Vec1 = ExtractVecBase;
15079for (
unsignedI = 0, Sz = ExtractMask.size();
I < Sz; ++
I) {
15082if (isa<UndefValue>(E->Scalars[
I]))
15084auto *EI = cast<ExtractElementInst>(StoredGS[
I]);
15085Value *VecOp = EI->getVectorOperand();
15086if (
constauto *TE = getTreeEntry(VecOp))
15087if (
TE->VectorizedValue)
15088 VecOp =
TE->VectorizedValue;
15091 }
elseif (Vec1 != VecOp) {
15092assert((!Vec2 || Vec2 == VecOp) &&
15093"Expected only 1 or 2 vectors shuffle.");
15099 IsUsedInExpr =
false;
15102 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15105 IsUsedInExpr &= FindReusedSplat(
15107 cast<FixedVectorType>(Vec1->
getType())->getNumElements(), 0,
15108 ExtractMask.size(), IsNotPoisonedVec);
15109 ShuffleBuilder.add(Vec1, ExtractMask,
/*ForExtracts=*/true);
15110 IsNonPoisoned &= IsNotPoisonedVec;
15112 IsUsedInExpr =
false;
15114/*ForExtracts=*/true);
15117if (!GatherShuffles.
empty()) {
15120for (
constauto [
I, TEs] :
enumerate(Entries)) {
15123"No shuffles with empty entries list expected.");
15127"Expected shuffle of 1 or 2 entries.");
15131copy(SubMask, std::next(VecMask.begin(),
I * SliceSize));
15132if (TEs.
size() == 1) {
15133bool IsNotPoisonedVec =
15134 TEs.
front()->VectorizedValue
15138 FindReusedSplat(VecMask, TEs.
front()->getVectorFactor(),
I,
15139 SliceSize, IsNotPoisonedVec);
15140 ShuffleBuilder.add(*TEs.
front(), VecMask);
15141 IsNonPoisoned &= IsNotPoisonedVec;
15143 IsUsedInExpr =
false;
15144 ShuffleBuilder.add(*TEs.
front(), *TEs.
back(), VecMask);
15145if (TEs.
front()->VectorizedValue && TEs.
back()->VectorizedValue)
15152// Try to figure out best way to combine values: build a shuffle and insert 15153// elements or just build several shuffles. 15154// Insert non-constant scalars. 15156int EMSz = ExtractMask.size();
15157int MSz =
Mask.size();
15158// Try to build constant vector and shuffle with it only if currently we 15159// have a single permutation and more than 1 scalar constants. 15160bool IsSingleShuffle = ExtractShuffles.
empty() || GatherShuffles.
empty();
15161bool IsIdentityShuffle =
15162 ((UseVecBaseAsInput ||
15164 [](
const std::optional<TTI::ShuffleKind> &SK) {
15168none_of(ExtractMask, [&](
intI) {
returnI >= EMSz; }) &&
15170 (!GatherShuffles.
empty() &&
15172 [](
const std::optional<TTI::ShuffleKind> &SK) {
15176none_of(Mask, [&](
intI) {
returnI >= MSz; }) &&
15178bool EnoughConstsForShuffle =
15182return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15186return isa<Constant>(V) && !isa<UndefValue>(V);
15188 (!IsIdentityShuffle ||
15189 (GatheredScalars.size() == 2 &&
15191 [](
Value *V) {
return !isa<UndefValue>(V); })) ||
15193return isa<Constant>(V) && !isa<PoisonValue>(V);
15195// NonConstants array contains just non-constant values, GatheredScalars 15196// contains only constant to build final vector and then shuffle. 15197for (
intI = 0, Sz = GatheredScalars.size();
I < Sz; ++
I) {
15198if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[
I]))
15203// Generate constants for final shuffle and build a mask for them. 15204if (!
all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15206 TryPackScalars(GatheredScalars, BVMask,
/*IsRootPoison=*/true);
15207Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15208 ShuffleBuilder.add(BV, BVMask);
15211return isa<PoisonValue>(V) ||
15212 (IsSingleShuffle && ((IsIdentityShuffle &&
15213 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15215 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15218 Res = ShuffleBuilder.finalize(
15219 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15221 TryPackScalars(NonConstants, Mask,
/*IsRootPoison=*/false);
15222 Vec = ShuffleBuilder.gather(NonConstants,
Mask.size(), Vec);
15225// Gather unique scalars and all constants. 15227 TryPackScalars(GatheredScalars, ReuseMask,
/*IsRootPoison=*/true);
15228Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.
size());
15229 ShuffleBuilder.add(BV, ReuseMask);
15230 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15233// Gather all constants. 15235for (
auto [
I, V] :
enumerate(GatheredScalars)) {
15236if (!isa<PoisonValue>(V))
15239Value *BV = ShuffleBuilder.gather(GatheredScalars);
15240 ShuffleBuilder.add(BV, Mask);
15241 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15246 Res = ShuffleBuilder.createFreeze(Res);
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
/// \returns \p I after propagating metadata from \p VL only for instructions in
/// \p VL.
static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
  SmallVector<Value *> Insts;
  for (Value *V : VL)
    if (isa<Instruction>(V))
      Insts.push_back(V);
  return llvm::propagateMetadata(Inst, Insts);
}

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }
15278Value *
V = E->Scalars.front();
15279Type *ScalarTy =
V->getType();
15280if (!isa<CmpInst>(V))
15282auto It = MinBWs.
find(E);
15283if (It != MinBWs.
end()) {
15284auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15290if (E->isGather()) {
15291// Set insert point for non-reduction initial nodes. 15292if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15293 setInsertPointAfterBundle(E);
15294Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15295 E->VectorizedValue = Vec;
15299bool IsReverseOrder =
15301auto FinalShuffle = [&](
Value *
V,
const TreeEntry *E) {
15302 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *
this);
15303if (E->getOpcode() == Instruction::Store &&
15304 E->State == TreeEntry::Vectorize) {
15306ArrayRef(
reinterpret_cast<constint *
>(E->ReorderIndices.begin()),
15307 E->ReorderIndices.size());
15308 ShuffleBuilder.add(V, Mask);
15309 }
elseif (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15310 ShuffleBuilder.addOrdered(V, {});
15312 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15315 E->CombinedEntriesWithIndices.size());
15317 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](
constauto &
P) {
15318 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15321 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15322"Expected either combined subnodes or reordering");
15323return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15326assert(!E->isGather() &&
"Unhandled state");
15327unsigned ShuffleOrOp =
15328 E->isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : E->getOpcode();
15330auto GetOperandSignedness = [&](
unsignedIdx) {
15331const TreeEntry *OpE = getOperandEntry(E,
Idx);
15332bool IsSigned =
false;
15333auto It = MinBWs.
find(OpE);
15334if (It != MinBWs.
end())
15335 IsSigned = It->second.second;
15338 if (isa<PoisonValue>(V))
15340 return !isKnownNonNegative(R, SimplifyQuery(*DL));
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
15356if (PostponedPHIs || !E->VectorizedValue) {
15361// Adjust insertion point once all PHI's have been generated. 15363 PH->getParent()->getFirstInsertionPt());
15366V = FinalShuffle(V, E);
15368 E->VectorizedValue =
V;
15372PHINode *NewPhi = cast<PHINode>(E->PHI);
15373// If phi node is fully emitted - exit. 15377// PHINodes may have multiple entries from the same block. We want to 15378// visit every block once. 15381for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
15385// Stop emission if all incoming values are generated. 15391if (!VisitedBBs.
insert(IBB).second) {
15398Value *Vec = vectorizeOperand(E,
I,
/*PostponedPHIs=*/true);
15399if (VecTy != Vec->
getType()) {
15401 MinBWs.
contains(getOperandEntry(E,
I))) &&
15402"Expected item in MinBWs.");
15403 Vec = Builder.
CreateIntCast(Vec, VecTy, GetOperandSignedness(
I));
15409"Invalid number of incoming values");
15410assert(E->VectorizedValue &&
"Expected vectorized value.");
15411return E->VectorizedValue;
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ExtractValue: {
15424auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15429 NewV = FinalShuffle(NewV, E);
15430 E->VectorizedValue = NewV;
15433case Instruction::InsertElement: {
15434assert(E->ReuseShuffleIndices.empty() &&
"All inserts should be unique");
15436Value *
V = vectorizeOperand(E, 1, PostponedPHIs);
15438Type *ScalarTy =
Op.front()->getType();
15439if (cast<VectorType>(
V->getType())->getElementType() != ScalarTy) {
15441 std::pair<unsigned, bool> Res = MinBWs.
lookup(getOperandEntry(E, 1));
15442assert(Res.first > 0 &&
"Expected item in MinBWs.");
15447 cast<FixedVectorType>(
V->getType())->getNumElements()),
15451// Create InsertVector shuffle if necessary 15452auto *FirstInsert = cast<Instruction>(*
find_if(E->Scalars, [E](
Value *V) {
15453 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15455constunsigned NumElts =
15456 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15457constunsigned NumScalars = E->Scalars.size();
15460assert(
Offset < NumElts &&
"Failed to find vector index offset");
15462// Create shuffle to resize vector 15464if (!E->ReorderIndices.empty()) {
15469 std::iota(
Mask.begin(), std::next(
Mask.begin(), NumScalars), 0);
15471// Create InsertVector shuffle if necessary 15472bool IsIdentity =
true;
15474Mask.swap(PrevMask);
15475for (
unsignedI = 0;
I < NumScalars; ++
I) {
15478 IsIdentity &= InsertIdx -
Offset ==
I;
15481if (!IsIdentity || NumElts != NumScalars) {
15483bool IsVNonPoisonous =
15486if (NumElts != NumScalars &&
Offset == 0) {
15487// Follow all insert element instructions from the current buildvector 15495 InsertMask[*InsertIdx] = *InsertIdx;
15496if (!
Ins->hasOneUse())
15498Ins = dyn_cast_or_null<InsertElementInst>(
15499Ins->getUniqueUndroppableUser());
15502buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15504 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15507if (!IsFirstPoison.
all()) {
15509for (
unsignedI = 0;
I < NumElts;
I++) {
15511 IsFirstUndef.
test(
I)) {
15512if (IsVNonPoisonous) {
15513 InsertMask[
I] =
I < NumScalars ?
I : 0;
15518if (
Idx >= NumScalars)
15519Idx = NumScalars - 1;
15520 InsertMask[
I] = NumScalars +
Idx;
15534if (
auto *
I = dyn_cast<Instruction>(V)) {
15535 GatherShuffleExtractSeq.
insert(
I);
15536 CSEBlocks.
insert(
I->getParent());
15541for (
unsignedI = 0;
I < NumElts;
I++) {
15546buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15549if ((!IsIdentity ||
Offset != 0 || !IsFirstUndef.
all()) &&
15550 NumElts != NumScalars) {
15551if (IsFirstUndef.
all()) {
15554 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15555if (!IsFirstPoison.
all()) {
15556for (
unsignedI = 0;
I < NumElts;
I++) {
15558 InsertMask[
I] =
I + NumElts;
15565 InsertMask, cast<Instruction>(E->Scalars.back())->
getName());
15566if (
auto *
I = dyn_cast<Instruction>(V)) {
15567 GatherShuffleExtractSeq.
insert(
I);
15568 CSEBlocks.
insert(
I->getParent());
15573 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15574for (
unsignedI = 0;
I < NumElts;
I++) {
15578 InsertMask[
I] += NumElts;
15581 FirstInsert->getOperand(0), V, InsertMask,
15582 cast<Instruction>(E->Scalars.back())->getName());
15583if (
auto *
I = dyn_cast<Instruction>(V)) {
15584 GatherShuffleExtractSeq.
insert(
I);
15585 CSEBlocks.
insert(
I->getParent());
15590 ++NumVectorInstructions;
15591 E->VectorizedValue =
V;
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);
15608Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15609if (E->VectorizedValue) {
15611return E->VectorizedValue;
15614auto *CI = cast<CastInst>(VL0);
15616Type *SrcScalarTy = cast<VectorType>(InVec->
getType())->getElementType();
15617auto SrcIt = MinBWs.
find(getOperandEntry(E, 0));
15619 (SrcIt != MinBWs.
end() || It != MinBWs.
end() ||
15621// Check if the values are candidates to demote. 15622unsigned SrcBWSz =
DL->getTypeSizeInBits(SrcScalarTy);
15623if (SrcIt != MinBWs.
end())
15624 SrcBWSz = SrcIt->second.first;
15626if (BWSz == SrcBWSz) {
15627 VecOpcode = Instruction::BitCast;
15628 }
elseif (BWSz < SrcBWSz) {
15629 VecOpcode = Instruction::Trunc;
15630 }
elseif (It != MinBWs.
end()) {
15631assert(BWSz > SrcBWSz &&
"Invalid cast!");
15632 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15633 }
elseif (SrcIt != MinBWs.
end()) {
15634assert(BWSz > SrcBWSz &&
"Invalid cast!");
15636 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15638 }
elseif (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.
end() &&
15639 !SrcIt->second.second) {
15640 VecOpcode = Instruction::UIToFP;
15642Value *
V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15644 : Builder.
CreateCast(VecOpcode, InVec, VecTy);
15645V = FinalShuffle(V, E);
15647 E->VectorizedValue =
V;
15648 ++NumVectorInstructions;
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);
15655Value *
L = vectorizeOperand(E, 0, PostponedPHIs);
15656if (E->VectorizedValue) {
15658return E->VectorizedValue;
15660Value *
R = vectorizeOperand(E, 1, PostponedPHIs);
15661if (E->VectorizedValue) {
15663return E->VectorizedValue;
15665if (
L->getType() !=
R->getType()) {
15667 getOperandEntry(E, 1)->
isGather() ||
15668 MinBWs.
contains(getOperandEntry(E, 0)) ||
15669 MinBWs.
contains(getOperandEntry(E, 1))) &&
15670"Expected item in MinBWs.");
15671if (cast<VectorType>(
L->getType())
15673 ->getIntegerBitWidth() < cast<VectorType>(
R->getType())
15675 ->getIntegerBitWidth()) {
15676Type *CastTy =
R->getType();
15679Type *CastTy =
L->getType();
15687if (
auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.
end())
15688 ICmp->setSameSign(
/*B=*/false);
15689// Do not cast for cmps. 15690 VecTy = cast<FixedVectorType>(
V->getType());
15691V = FinalShuffle(V, E);
15693 E->VectorizedValue =
V;
15694 ++NumVectorInstructions;
15697case Instruction::Select: {
15698 setInsertPointAfterBundle(E);
15700Value *
Cond = vectorizeOperand(E, 0, PostponedPHIs);
15701if (E->VectorizedValue) {
15703return E->VectorizedValue;
15705Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15706if (E->VectorizedValue) {
15708return E->VectorizedValue;
15710Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15711if (E->VectorizedValue) {
15713return E->VectorizedValue;
15717 getOperandEntry(E, 2)->
isGather() ||
15718 MinBWs.
contains(getOperandEntry(E, 1)) ||
15719 MinBWs.
contains(getOperandEntry(E, 2))) &&
15720"Expected item in MinBWs.");
15722 True = Builder.
CreateIntCast(True, VecTy, GetOperandSignedness(1));
15723if (False->
getType() != VecTy)
15724 False = Builder.
CreateIntCast(False, VecTy, GetOperandSignedness(2));
15729assert(TrueNumElements >= CondNumElements &&
15730 TrueNumElements % CondNumElements == 0 &&
15731"Cannot vectorize Instruction::Select");
15733"Cannot vectorize Instruction::Select");
15734if (CondNumElements != TrueNumElements) {
15735// When the return type is i1 but the source is fixed vector type, we 15736// need to duplicate the condition value. 15742"Cannot vectorize Instruction::Select");
15744V = FinalShuffle(V, E);
15746 E->VectorizedValue =
V;
15747 ++NumVectorInstructions;
15750case Instruction::FNeg: {
15751 setInsertPointAfterBundle(E);
15753Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15755if (E->VectorizedValue) {
15757return E->VectorizedValue;
15763if (
auto *
I = dyn_cast<Instruction>(V))
15766V = FinalShuffle(V, E);
15768 E->VectorizedValue =
V;
15769 ++NumVectorInstructions;
15773case Instruction::Freeze: {
15774 setInsertPointAfterBundle(E);
15776Value *
Op = vectorizeOperand(E, 0, PostponedPHIs);
15778if (E->VectorizedValue) {
15780return E->VectorizedValue;
15783if (
Op->getType() != VecTy) {
15785 MinBWs.
contains(getOperandEntry(E, 0))) &&
15786"Expected item in MinBWs.");
15790V = FinalShuffle(V, E);
15792 E->VectorizedValue =
V;
15793 ++NumVectorInstructions;
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);
15817Value *
LHS = vectorizeOperand(E, 0, PostponedPHIs);
15818if (E->VectorizedValue) {
15820return E->VectorizedValue;
15822Value *
RHS = vectorizeOperand(E, 1, PostponedPHIs);
15823if (E->VectorizedValue) {
15825return E->VectorizedValue;
15827if (ShuffleOrOp == Instruction::And && It != MinBWs.
end()) {
15828for (
unsignedI : seq<unsigned>(0, E->getNumOperands())) {
15831auto *CI = dyn_cast<ConstantInt>(
Op);
15832return CI && CI->getValue().countr_one() >= It->second.first;
15834V = FinalShuffle(
I == 0 ? RHS : LHS, E);
15835 E->VectorizedValue =
V;
15836 ++NumVectorInstructions;
15843 getOperandEntry(E, 1)->
isGather() ||
15844 MinBWs.
contains(getOperandEntry(E, 0)) ||
15845 MinBWs.
contains(getOperandEntry(E, 1))) &&
15846"Expected item in MinBWs.");
15857if (
auto *
I = dyn_cast<Instruction>(V)) {
15859// Drop nuw flags for abs(sub(commutative), true). 15860if (!MinBWs.
contains(E) && ShuffleOrOp == Instruction::Sub &&
15862 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15864I->setHasNoUnsignedWrap(
/*b=*/false);
15867V = FinalShuffle(V, E);
15869 E->VectorizedValue =
V;
15870 ++NumVectorInstructions;
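    // Illustrative sketch for the bitwidth-aware AND shortcut above
    // (hypothetical values): if the bundle was demoted to i8 and one operand
    // consists of constants such as
    //   %a0 = and i32 %x0, 255
    //   %a1 = and i32 %x1, 255
    // then masking with 255 is a no-op at 8 bits, so the other operand is
    // simply forwarded (after the final shuffle) instead of emitting a vector
    // 'and'.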
    case Instruction::Load: {
      // Loads are inserted at the head of the tree because we don't want to
      // sink them all the way down past store instructions.
      setInsertPointAfterBundle(E);

      LoadInst *LI = cast<LoadInst>(VL0);
      Instruction *NewLI;
      Value *PO = LI->getPointerOperand();
      if (E->State == TreeEntry::Vectorize) {
        NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
      } else if (E->State == TreeEntry::StridedVectorize) {
        Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
        Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
        PO = IsReverseOrder ? PtrN : Ptr0;
        std::optional<int> Diff = getPointersDiff(
            VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
        Type *StrideTy = DL->getIndexType(PO->getType());
        Value *StrideVal;
        if (Diff) {
          int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
          StrideVal =
              ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                             DL->getTypeAllocSize(ScalarTy));
        } else {
          SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
          transform(E->Scalars, PointerOps.begin(), [](Value *V) {
            return cast<LoadInst>(V)->getPointerOperand();
          });
          OrdersType Order;
          std::optional<Value *> Stride =
              calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
                                &*Builder.GetInsertPoint());
          Value *NewStride =
              Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
          StrideVal = Builder.CreateMul(
              NewStride,
              ConstantInt::get(StrideTy,
                               (IsReverseOrder ? -1 : 1) *
                                   static_cast<int>(
                                       DL->getTypeAllocSize(ScalarTy))));
        }
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_load,
            {VecTy, PO->getType(), StrideTy},
            {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/0,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        NewLI = Inst;
      } else {
        assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
        Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          // CreateMaskedGather expects VecTy and VecPtr have same size. We
          // need to expand VecPtr if ScalarTy is a vector type.
          unsigned ScalarTyNumElements =
              cast<FixedVectorType>(ScalarTy)->getNumElements();
          unsigned VecTyNumElements =
              cast<FixedVectorType>(VecTy)->getNumElements();
          assert(VecTyNumElements % ScalarTyNumElements == 0 &&
                 "Cannot expand getelementptr.");
          unsigned VF = VecTyNumElements / ScalarTyNumElements;
          SmallVector<Constant *> Indices(VecTyNumElements);
          transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
            return Builder.getInt64(I % ScalarTyNumElements);
          });
          VecPtr = Builder.CreateGEP(
              VecTy->getElementType(),
              Builder.CreateShuffleVector(
                  VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
              ConstantVector::get(Indices));
        }
        // Use the minimum alignment of the gathered loads.
        Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
        NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
      }
      Value *V = ::propagateMetadata(NewLI, E->Scalars);

      V = FinalShuffle(V, E);
      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
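    // Illustrative sketch for the strided-load path above (hypothetical
    // addresses): four i32 loads whose pointers differ by a constant 8 bytes
    // become a single VP strided load, roughly
    //   %v = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
    //            ptr %base, i64 8,
    //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
    // A reverse-ordered bundle starts from the last pointer and uses a
    // negative stride instead.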
    case Instruction::Store: {
      auto *SI = cast<StoreInst>(VL0);

      setInsertPointAfterBundle(E);

      Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
      if (VecValue->getType() != VecTy)
        VecValue =
            Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
      VecValue = FinalShuffle(VecValue, E);

      Value *Ptr = SI->getPointerOperand();
      Instruction *ST;
      if (E->State == TreeEntry::Vectorize) {
        ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
      } else {
        assert(E->State == TreeEntry::StridedVectorize &&
               "Expected either strided or consecutive stores.");
        if (!E->ReorderIndices.empty()) {
          SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
          Ptr = SI->getPointerOperand();
        }
        Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
        Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
        auto *Inst = Builder.CreateIntrinsic(
            Intrinsic::experimental_vp_strided_store,
            {VecTy, Ptr->getType(), StrideTy},
            {VecValue, Ptr,
             ConstantInt::get(
                 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
             Builder.getAllOnesMask(VecTy->getElementCount()),
             Builder.getInt32(E->Scalars.size())});
        Inst->addParamAttr(
            /*ArgNo=*/1,
            Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
        ST = Inst;
      }

      Value *V = ::propagateMetadata(ST, E->Scalars);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
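    // Illustrative sketch for the strided-store path above (hypothetical
    // addresses): the stored vector is written through the VP intrinsic with
    // a negative element stride, roughly
    //   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
    //            <4 x i32> %vec, ptr %ptr, i64 -4,
    //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)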
    case Instruction::GetElementPtr: {
      auto *GEP0 = cast<GetElementPtrInst>(VL0);
      setInsertPointAfterBundle(E);

      Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }

      SmallVector<Value *> OpVecs;
      for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
        Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        OpVecs.push_back(OpVec);
      }

      Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
      if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
        SmallVector<Value *> GEPs;
        for (Value *V : E->Scalars) {
          if (isa<GetElementPtrInst>(V))
            GEPs.push_back(V);
        }
        V = ::propagateMetadata(I, GEPs);
      }

      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;

      return V;
    }
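    // Illustrative sketch for the GEP case above (hypothetical operands): a
    // bundle of getelementptrs over the same base with different indices is
    // emitted as a single GEP with a vector index, yielding a vector of
    // pointers:
    //   %gep = getelementptr i32, ptr %base,
    //                        <4 x i64> <i64 0, i64 2, i64 4, i64 6>
    // which can then feed a masked gather or scatter.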
    case Instruction::Call: {
      CallInst *CI = cast<CallInst>(VL0);
      setInsertPointAfterBundle(E);

      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

      SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
          CI, ID, VecTy->getNumElements(),
          It != MinBWs.end() ? It->second.first : 0, TTI);
      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
      bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                          VecCallCosts.first <= VecCallCosts.second;

      Value *ScalarArg = nullptr;
      SmallVector<Value *> OpVecs;
      SmallVector<Type *, 2> TysForDecl;
      // Add return type if intrinsic is overloaded on it.
      if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
        TysForDecl.push_back(VecTy);
      auto *CEI = cast<CallInst>(VL0);
      for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
        // Some intrinsics have scalar arguments. This argument should not be
        // vectorized.
        if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
          ScalarArg = CEI->getArgOperand(I);
          // If we decided to reduce the bitwidth of the abs intrinsic, its
          // second argument must be set to false (do not return poison if the
          // value is the signed minimum).
          if (ID == Intrinsic::abs && It != MinBWs.end() &&
              It->second.first < DL->getTypeSizeInBits(CEI->getType()))
            ScalarArg = Builder.getFalse();
          OpVecs.push_back(ScalarArg);
          if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
            TysForDecl.push_back(ScalarArg->getType());
          continue;
        }

        Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        ScalarArg = CEI->getArgOperand(I);
        if (cast<VectorType>(OpVec->getType())->getElementType() !=
                ScalarArg->getType()->getScalarType() &&
            It == MinBWs.end()) {
          auto *CastTy =
              getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
          OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
        } else if (It != MinBWs.end()) {
          OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
        }
        OpVecs.push_back(OpVec);
        if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
          TysForDecl.push_back(OpVec->getType());
      }

      Function *CF;
      if (!UseIntrinsic) {
        VFShape Shape = VFShape::get(
            CI->getFunctionType(),
            ElementCount::getFixed(
                static_cast<unsigned>(VecTy->getNumElements())),
            false /*HasGlobalPred*/);
        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
      } else {
        CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
      }

      SmallVector<OperandBundleDef, 1> OpBundles;
      CI->getOperandBundlesAsDefs(OpBundles);
      Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);

      propagateIRFlags(V, E->Scalars, VL0);
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
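    // Illustrative sketch for the call case above (hypothetical costs): for a
    // bundle of four llvm.fabs.f32 calls, VecCallCosts compares the vector
    // intrinsic against a vector-library routine; when the intrinsic is not
    // more expensive the bundle becomes
    //   %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
    // otherwise a matching entry from the VFDatabase is called instead.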
    case Instruction::ShuffleVector: {
      Value *V;
      if (SLPReVec && !E->isAltShuffle()) {
        setInsertPointAfterBundle(E);
        Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
        if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
          assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
                 "Not supported shufflevector usage.");
          SmallVector<int> NewMask(ThisMask.size());
          transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
            return SVSrc->getShuffleMask()[Mask];
          });
          V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
        } else {
          V = Builder.CreateShuffleVector(Src, ThisMask);
        }
        propagateIRFlags(V, E->Scalars, VL0);
        if (auto *I = dyn_cast<Instruction>(V))
          V = ::propagateMetadata(I, E->Scalars);
        V = FinalShuffle(V, E);
      } else {
        assert(E->isAltShuffle() &&
               ((Instruction::isBinaryOp(E->getOpcode()) &&
                 Instruction::isBinaryOp(E->getAltOpcode())) ||
                (Instruction::isCast(E->getOpcode()) &&
                 Instruction::isCast(E->getAltOpcode())) ||
                (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
               "Invalid Shuffle Vector Operand");

        Value *LHS = nullptr, *RHS = nullptr;
        if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
          if (E->VectorizedValue) {
            LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
            return E->VectorizedValue;
          }
          RHS = vectorizeOperand(E, 1, PostponedPHIs);
        } else {
          setInsertPointAfterBundle(E);
          LHS = vectorizeOperand(E, 0, PostponedPHIs);
        }
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        if (LHS && RHS &&
            ((Instruction::isBinaryOp(E->getOpcode()) &&
              (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
             (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
          assert((It != MinBWs.end() ||
                  getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                  getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                  MinBWs.contains(getOperandEntry(E, 0)) ||
                  MinBWs.contains(getOperandEntry(E, 1))) &&
                 "Expected item in MinBWs.");
          Type *CastTy = VecTy;
          if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
            if (cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                                 ->getElementType()
                                                 ->getIntegerBitWidth())
              CastTy = RHS->getType();
            else
              CastTy = LHS->getType();
          }
          if (LHS->getType() != CastTy)
            LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
          if (RHS->getType() != CastTy)
            RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
        }

        Value *V0, *V1;
        if (Instruction::isBinaryOp(E->getOpcode())) {
          V0 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
          V1 = Builder.CreateBinOp(
              static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
        } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
          V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
          auto *AltCI = cast<CmpInst>(E->getAltOp());
          CmpInst::Predicate AltPred = AltCI->getPredicate();
          V1 = Builder.CreateCmp(AltPred, LHS, RHS);
        } else {
          if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
            unsigned SrcBWSz = DL->getTypeSizeInBits(
                cast<VectorType>(LHS->getType())->getElementType());
            unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
            if (BWSz <= SrcBWSz) {
              assert(LHS->getType() == VecTy &&
                     "Expected same type as operand.");
              if (auto *I = dyn_cast<Instruction>(LHS))
                LHS = ::propagateMetadata(I, E->Scalars);
              LHS = FinalShuffle(LHS, E);
              E->VectorizedValue = LHS;
              ++NumVectorInstructions;
              return LHS;
            }
          }
          V0 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
          V1 = Builder.CreateCast(
              static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
        }
        // Add V0 and V1 to later analysis to try to find and remove matching
        // instruction, if any.
        for (Value *V : {V0, V1}) {
          if (auto *I = dyn_cast<Instruction>(V)) {
            GatherShuffleExtractSeq.insert(I);
            CSEBlocks.insert(I->getParent());
          }
        }

        // Create shuffle to take alternate operations from the vector.
        // Also, gather up main and alt scalar ops to propagate IR flags to
        // each vector operation.
        ValueList OpScalars, AltScalars;
        SmallVector<int> Mask;
        E->buildAltOpShuffleMask(
            [E, this](Instruction *I) {
              assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
              return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                            *TLI);
            },
            Mask, &OpScalars, &AltScalars);

        propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
        propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
        auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
          // Drop nuw flags for abs(sub(commutative), true).
          if (auto *I = dyn_cast<Instruction>(Vec);
              I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
              any_of(E->Scalars, [](Value *V) {
                if (isa<PoisonValue>(V))
                  return false;
                auto *IV = cast<Instruction>(V);
                return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
              }))
            I->setHasNoUnsignedWrap(/*b=*/false);
        };
        DropNuwFlag(V0, E->getOpcode());
        DropNuwFlag(V1, E->getAltOpcode());

        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          transformScalarShuffleIndiciesToVector(VecTy->getNumElements(), Mask);
        }
        V = Builder.CreateShuffleVector(V0, V1, Mask);
        if (auto *I = dyn_cast<Instruction>(V)) {
          V = ::propagateMetadata(I, E->Scalars);
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      E->VectorizedValue = V;
      ++NumVectorInstructions;
      return V;
    }
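    // Illustrative sketch for the alternate-opcode path above (hypothetical
    // bundle): for {a0+b0, a1-b1, a2+b2, a3-b3} both wide operations are
    // emitted and blended with the mask built by buildAltOpShuffleMask:
    //   %vadd = add <4 x i32> %a, %b
    //   %vsub = sub <4 x i32> %a, %b
    //   %v    = shufflevector <4 x i32> %vadd, <4 x i32> %vsub,
    //                         <4 x i32> <i32 0, i32 5, i32 2, i32 7>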
  // All blocks must be scheduled before any instructions are inserted.
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
  // need to rebuild it.
  EntryToLastInstruction.clear();

  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
  (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Run through the list of postponed gathers and emit them, replacing the
  // temporary emitted allocas with actual vector instructions.
  ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
  DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found gather node which is absolutely the same as one of the
        // vectorized nodes. It may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // If the user is a PHI node, its vector code has to be inserted right
    // before the block terminator. Since the node was delayed, there were
    // some unresolved dependencies at the moment when the stub instruction
    // was emitted. If any of these dependencies turns out to be an operand of
    // another PHI coming from this same block, the position of the stub
    // instruction becomes invalid. This is because the source vector that is
    // supposed to feed this gather node was inserted at the end of the block
    // [after the stub instruction]. So we need to adjust the insertion point
    // again, to the end of the block.
    if (isa<PHINode>(UserI)) {
      // Insert before all users.
      Instruction *InsertPt = PrevVec->getParent()->getTerminator();
      for (User *U : PrevVec->users()) {
        if (U == UserI)
          continue;
        auto *UI = dyn_cast<Instruction>(U);
        if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
          continue;
        if (UI->comesBefore(InsertPt))
          InsertPt = UI;
      }
      Builder.SetInsertPoint(InsertPt);
    } else {
      Builder.SetInsertPoint(PrevVec);
    }
16384if (
auto *VecI = dyn_cast<Instruction>(Vec);
16389if (Vec->
getType() != PrevVec->getType()) {
16391 PrevVec->getType()->isIntOrIntVectorTy() &&
16392"Expected integer vector types only.");
16393 std::optional<bool> IsSigned;
16394for (
Value *V : TE->Scalars) {
16395if (
const TreeEntry *BaseTE = getTreeEntry(V)) {
16396auto It = MinBWs.
find(BaseTE);
16397if (It != MinBWs.
end()) {
16398 IsSigned = IsSigned.value_or(
false) || It->second.second;
16402for (
const TreeEntry *MNTE : MultiNodeScalars.
lookup(V)) {
16403auto It = MinBWs.
find(MNTE);
16404if (It != MinBWs.
end()) {
16405 IsSigned = IsSigned.value_or(
false) || It->second.second;
16410if (IsSigned.value_or(
false))
16412// Scan through gather nodes. 16413for (
const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16414auto It = MinBWs.
find(BVE);
16415if (It != MinBWs.
end()) {
16416 IsSigned = IsSigned.value_or(
false) || It->second.second;
16421if (IsSigned.value_or(
false))
16423if (
auto *EE = dyn_cast<ExtractElementInst>(V)) {
16425 IsSigned.value_or(
false) ||
16429if (IsSigned.value_or(
false))
16433if (IsSigned.value_or(
false)) {
16434// Final attempt - check user node. 16435auto It = MinBWs.
find(TE->UserTreeIndices.front().UserTE);
16436if (It != MinBWs.
end())
16437 IsSigned = It->second.second;
16440"Expected user node or perfect diamond match in MinBWs.");
16444 PostponedValues.
try_emplace(Vec).first->second.push_back(TE);
16445// Replace the stub vector node, if it was used before for one of the 16446// buildvector nodes already. 16447auto It = PostponedValues.
find(PrevVec);
16448if (It != PostponedValues.
end()) {
16449for (TreeEntry *VTE : It->getSecond())
16450 VTE->VectorizedValue = Vec;
16459// Maps vector instruction to original insertelement instruction 16461// Maps extract Scalar to the corresponding extractelement instruction in the 16462// basic block. Only one extractelement per block should be emitted. 16469// Extract all of the elements with the external uses. 16470for (
constauto &ExternalUse : ExternalUses) {
16471Value *Scalar = ExternalUse.Scalar;
16474// Skip users that we already RAUW. This happens when one instruction 16475// has multiple uses of the same value. 16478 TreeEntry *E = getTreeEntry(Scalar);
16479assert(E &&
"Invalid scalar");
16480assert(!E->isGather() &&
"Extracting from a gather list");
16481// Non-instruction pointers are not deleted, just skip them. 16482if (E->getOpcode() == Instruction::GetElementPtr &&
16483 !isa<GetElementPtrInst>(Scalar))
16486Value *Vec = E->VectorizedValue;
16487assert(Vec &&
"Can't find vectorizable value");
16490auto ExtractAndExtendIfNeeded = [&](
Value *Vec) {
16491if (Scalar->getType() != Vec->
getType()) {
16493Value *ExV =
nullptr;
16494auto *Inst = dyn_cast<Instruction>(Scalar);
16495bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.
contains(Inst);
16496auto It = ScalarToEEs.
find(Scalar);
16497if (It != ScalarToEEs.
end()) {
16498// No need to emit many extracts, just move the only one in the 16500auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16502if (EEIt != It->second.end()) {
16503Value *PrevV = EEIt->second.first;
16504if (
auto *
I = dyn_cast<Instruction>(PrevV);
16505I && !ReplaceInst &&
16510if (
auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16514 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16518// "Reuse" the existing extract to improve final codegen. 16520// Leave the instruction as is, if it cheaper extracts and all 16521// operands are scalar. 16522if (
auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16523 IgnoredExtracts.
insert(EE);
16526auto *CloneInst = Inst->clone();
16527 CloneInst->insertBefore(Inst->getIterator());
16528if (Inst->hasName())
16532 }
elseif (
auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16533 ES && isa<Instruction>(Vec)) {
16534Value *V = ES->getVectorOperand();
16535auto *IVec = cast<Instruction>(Vec);
16536if (
const TreeEntry *ETE = getTreeEntry(V))
16537 V = ETE->VectorizedValue;
16538if (
auto *
IV = dyn_cast<Instruction>(V);
16539 !
IV ||
IV == Vec ||
IV->getParent() != IVec->getParent() ||
16540IV->comesBefore(IVec))
16544 }
elseif (
auto *VecTy =
16545 dyn_cast<FixedVectorType>(Scalar->getType())) {
16548// When REVEC is enabled, we need to extract a vector. 16549// Note: The element size of Scalar may be different from the 16550// element size of Vec. 16552 ExternalUse.Lane * VecTyNumElements);
16556// If necessary, sign-extend or zero-extend ScalarRoot 16557// to the larger type. 16559if (Scalar->getType() != Ex->
getType())
16561 Ex, Scalar->getType(),
16563auto *
I = dyn_cast<Instruction>(Ex);
16565 : &
F->getEntryBlock(),
16566 std::make_pair(Ex, ExV));
16568// The then branch of the previous if may produce constants, since 0 16569// operand might be a constant. 16570if (
auto *ExI = dyn_cast<Instruction>(Ex);
16572 GatherShuffleExtractSeq.
insert(ExI);
16573 CSEBlocks.
insert(ExI->getParent());
16577assert(isa<FixedVectorType>(Scalar->getType()) &&
16578 isa<InsertElementInst>(Scalar) &&
16579"In-tree scalar of vector type is not insertelement?");
16580auto *IE = cast<InsertElementInst>(Scalar);
16584// If User == nullptr, the Scalar remains as scalar in vectorized 16585// instructions or is used as extra arg. Generate ExtractElement instruction 16586// and update the record for this scalar in ExternallyUsedValues. 16588if (!ScalarsWithNullptrUser.
insert(Scalar).second)
16592 ExternalUsesAsOriginalScalar.
contains(Scalar) ||
16595 if (ExternalUsesAsOriginalScalar.contains(U))
16597 TreeEntry *UseEntry = getTreeEntry(U);
16599 (UseEntry->State == TreeEntry::Vectorize ||
16601 TreeEntry::StridedVectorize) &&
16602 (E->State == TreeEntry::Vectorize ||
16603 E->State == TreeEntry::StridedVectorize) &&
16604 doesInTreeUserNeedToExtract(
16605 Scalar, getRootEntryInstruction(*UseEntry),
16608"Scalar with nullptr User must be registered in " 16609"ExternallyUsedValues map or remain as scalar in vectorized " 16611if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16612if (
auto *
PHI = dyn_cast<PHINode>(VecI)) {
16613if (
PHI->getParent()->isLandingPad())
16617PHI->getParent()->getLandingPadInst()->getIterator()));
16620PHI->getParent()->getFirstNonPHIIt());
16623 std::next(VecI->getIterator()));
16628Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16629// Required to update internally referenced instructions. 16630if (Scalar != NewInst) {
16631assert((!isa<ExtractElementInst>(Scalar) ||
16632 !IgnoredExtracts.
contains(cast<ExtractElementInst>(Scalar))) &&
16633"Extractelements should not be replaced.");
16634 Scalar->replaceAllUsesWith(NewInst);
16639if (
auto *VU = dyn_cast<InsertElementInst>(
User);
16641// Skip if the scalar is another vector op or Vec is not an instruction. 16642if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16643if (
auto *FTy = dyn_cast<FixedVectorType>(
User->
getType())) {
16644if (!UsedInserts.
insert(VU).second)
16646// Need to use original vector, if the root is truncated. 16647auto BWIt = MinBWs.
find(E);
16649auto *ScalarTy = FTy->getElementType();
16650auto Key = std::make_pair(Vec, ScalarTy);
16651auto VecIt = VectorCasts.
find(Key);
16652if (VecIt == VectorCasts.
end()) {
16654if (
auto *IVec = dyn_cast<PHINode>(Vec)) {
16655if (IVec->getParent()->isLandingPad())
16657 std::next(IVec->getParent()
16658 ->getLandingPadInst()
16662 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16663 }
elseif (
auto *IVec = dyn_cast<Instruction>(Vec)) {
16670 cast<FixedVectorType>(Vec->
getType())->getNumElements()),
16671 BWIt->second.second);
16674 Vec = VecIt->second;
16681 ShuffledInserts, [VU](
const ShuffledInsertData<Value *> &
Data) {
16682// Checks if 2 insertelements are from the same buildvector. 16688unsignedIdx = *InsertIdx;
16689if (It == ShuffledInserts.
end()) {
16691 It = std::next(ShuffledInserts.
begin(),
16692 ShuffledInserts.
size() - 1);
16697 Mask[
Idx] = ExternalUse.Lane;
16698 It->InsertElements.push_back(cast<InsertElementInst>(
User));
16705// Generate extracts for out-of-tree users. 16706// Find the insertion point for the extractelement lane. 16707if (
auto *VecI = dyn_cast<Instruction>(Vec)) {
16709for (
unsignedI : seq<unsigned>(0, PH->getNumIncomingValues())) {
16710if (PH->getIncomingValue(
I) == Scalar) {
16712 PH->getIncomingBlock(
I)->getTerminator();
16713if (isa<CatchSwitchInst>(IncomingTerminator)) {
16715 std::next(VecI->getIterator()));
16719Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16720 PH->setOperand(
I, NewInst);
16725Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16730Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16740int VF = cast<FixedVectorType>(V1->
getType())->getNumElements();
16741for (
intI = 0, E = Mask.size();
I < E; ++
I) {
16743 CombinedMask1[
I] = Mask[
I];
16745 CombinedMask2[
I] = Mask[
I] - VF;
16748 cast<VectorType>(V1->
getType())->getElementType(), Builder, *
this);
16749 ShuffleBuilder.
add(V1, CombinedMask1);
16751 ShuffleBuilder.
add(V2, CombinedMask2);
16752return ShuffleBuilder.
finalize({}, {}, {});
16756bool ForSingleMask) {
16757unsigned VF = Mask.size();
16758unsigned VecVF = cast<FixedVectorType>(Vec->
getType())->getNumElements();
16760if (
any_of(Mask, [VF](
intIdx) {
returnIdx >=
static_cast<int>(VF); })) {
16761 Vec = CreateShuffle(Vec,
nullptr, Mask);
16762return std::make_pair(Vec,
true);
16764if (!ForSingleMask) {
16766for (
unsignedI = 0;
I < VF; ++
I) {
16768 ResizeMask[Mask[
I]] = Mask[
I];
16770 Vec = CreateShuffle(Vec,
nullptr, ResizeMask);
16774return std::make_pair(Vec,
false);
16776// Perform shuffling of the vectorize tree entries for better handling of 16777// external extracts. 16778for (
intI = 0, E = ShuffledInserts.
size();
I < E; ++
I) {
16779// Find the first and the last instruction in the list of insertelements. 16784autoVector = ShuffledInserts[
I].ValueMasks.takeVector();
16785Value *NewInst = performExtractsShuffleAction<Value>(
16789 return cast<VectorType>(Vec->getType())
16790 ->getElementCount()
16791 .getKnownMinValue();
16796 assert((Vals.size() == 1 || Vals.size() == 2) &&
16797"Expected exactly 1 or 2 input values.");
16798 if (Vals.size() == 1) {
16799// Do not create shuffle if the mask is a simple identity 16800// non-resizing mask. 16801 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16802 ->getNumElements() ||
16803 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16804 return CreateShuffle(Vals.front(), nullptr, Mask);
16805 return Vals.front();
16807return CreateShuffle(Vals.
front() ? Vals.
front()
16809 Vals.
back(), Mask);
16811auto It = ShuffledInserts[
I].InsertElements.
rbegin();
16812// Rebuild buildvector chain. 16814if (It != ShuffledInserts[
I].InsertElements.
rend())
16817while (It != ShuffledInserts[
I].InsertElements.
rend()) {
16818assert(
II &&
"Must be an insertelement instruction.");
16823II = dyn_cast<InsertElementInst>(
II->getOperand(0));
16826II->replaceUsesOfWith(
II->getOperand(0), NewInst);
16827if (
auto *NewI = dyn_cast<Instruction>(NewInst))
16828if (
II->getParent() == NewI->getParent() &&
II->comesBefore(NewI))
16829II->moveAfter(NewI);
16832 LastInsert->replaceAllUsesWith(NewInst);
16834 IE->replaceUsesOfWith(IE->getOperand(0),
16836 IE->replaceUsesOfWith(IE->getOperand(1),
16840 CSEBlocks.
insert(LastInsert->getParent());
16844// For each vectorized value: 16845for (
auto &TEPtr : VectorizableTree) {
16846 TreeEntry *Entry = TEPtr.get();
16848// No need to handle users of gathered values. 16849if (Entry->isGather())
16852assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
16855for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16856Value *Scalar = Entry->Scalars[Lane];
16858if (Entry->getOpcode() == Instruction::GetElementPtr &&
16859 !isa<GetElementPtrInst>(Scalar))
16861if (
auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16862 EE && IgnoredExtracts.contains(EE))
16864if (isa<PoisonValue>(Scalar))
16867Type *Ty = Scalar->getType();
16869for (
User *U : Scalar->users()) {
16872// It is legal to delete users in the ignorelist. 16873assert((getTreeEntry(U) ||
16874 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16875 (isa_and_nonnull<Instruction>(U) &&
16876 isDeleted(cast<Instruction>(U)))) &&
16877"Deleting out-of-tree value");
16881LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
16882auto *
I = cast<Instruction>(Scalar);
16887// Merge the DIAssignIDs from the about-to-be-deleted instructions into the 16888// new vector instruction. 16889if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16890V->mergeDIAssignID(RemovedInsts);
16892// Clear up reduction references, if any. 16893if (UserIgnoreList) {
16895const TreeEntry *
IE = getTreeEntry(
I);
16897 !(VectorizableTree.front()->isGather() &&
16898 !
IE->UserTreeIndices.empty() &&
16899 (ValueToGatherNodes.lookup(
I).contains(
16900 VectorizableTree.front().get()) ||
16902 [&](
const EdgeInfo &EI) {
16903 return EI.UserTE == VectorizableTree.front().get() &&
16904 EI.EdgeIdx == UINT_MAX;
16906 !(GatheredLoadsEntriesFirst.has_value() &&
16907IE->Idx >= *GatheredLoadsEntriesFirst &&
16908 VectorizableTree.front()->isGather() &&
16913// Do not replace condition of the logical op in form select <cond>. 16914 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16915 (match(U.getUser(), m_LogicalAnd()) ||
16916 match(U.getUser(), m_LogicalOr())) &&
16917 U.getOperandNo() == 0;
16918 if (IsPoisoningLogicalOp) {
16919 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16922return UserIgnoreList->contains(
U.getUser());
16924// Replace conditions of the poisoning logical ops with the non-poison 16930// Retain to-be-deleted instructions for some debug-info bookkeeping and alias 16931// cache correctness. 16932// NOTE: removeInstructionAndOperands only marks the instruction for deletion 16933// - instructions are not deleted until later. 16934 removeInstructionsAndOperands(
ArrayRef(RemovedInsts));
16937 InstrElementSize.
clear();
16939const TreeEntry &RootTE = *VectorizableTree.front();
16940Value *Vec = RootTE.VectorizedValue;
16941if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16942 It != MinBWs.end() &&
16943 ReductionBitWidth != It->second.first) {
16946 ReductionRoot->getIterator());
16950 cast<VectorType>(Vec->
getType())->getElementCount()),
16951 It->second.second);
16958 <<
" gather sequences instructions.\n");
16959// LICM InsertElementInst sequences. 16964// Check if this block is inside a loop. 16965Loop *L = LI->getLoopFor(
I->getParent());
16969// Check if it has a preheader. 16970BasicBlock *PreHeader = L->getLoopPreheader();
16974// If the vector or the element that we insert into it are 16975// instructions that are defined in this basic block then we can't 16976// hoist this instruction. 16978 auto *OpI = dyn_cast<Instruction>(V);
16979 return OpI && L->contains(OpI);
16983// We can hoist this instruction. Move it to the pre-header. 16985 CSEBlocks.
insert(PreHeader);
16988// Make a list of all reachable blocks in our CSE queue. 16997// Sort blocks by domination. This ensures we visit a block after all blocks 16998// dominating it are visited. 17000assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
17001"Different nodes should have different DFS numbers");
17002returnA->getDFSNumIn() <
B->getDFSNumIn();
  // Less defined shuffles can be replaced by the more defined copies.
  // Between two shuffles one is less defined if it has the same vector
  // operands and its mask indices are the same as in the first one or undefs.
  // E.g. shuffle %0, poison, <0, 0, 0, undef> is less defined than
  // shuffle %0, poison, <0, 0, 0, 0>.
  auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
                                           SmallVectorImpl<int> &NewMask) {
    if (I1->getType() != I2->getType())
      return false;
    auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
    auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
    if (!SI1 || !SI2)
      return I1->isIdenticalTo(I2);
    if (SI1->isIdenticalTo(SI2))
      return true;
    for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
      if (SI1->getOperand(I) != SI2->getOperand(I))
        return false;
    // Check if the second instruction is more defined than the first one.
    NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
    ArrayRef<int> SM1 = SI1->getShuffleMask();
    // Count trailing undefs in the mask to check the final number of used
    // registers.
    unsigned LastUndefsCnt = 0;
17030for (
intI = 0, E = NewMask.
size();
I < E; ++
I) {
17036 NewMask[
I] != SM1[
I])
17039 NewMask[
I] = SM1[
I];
17041// Check if the last undefs actually change the final number of used vector 17043return SM1.
size() - LastUndefsCnt > 1 &&
17047 SM1.
size() - LastUndefsCnt));
17049// Perform O(N^2) search over the gather/shuffle sequences and merge identical 17050// instructions. TODO: We can further optimize this scan if we split the 17051// instructions into different buckets based on the insert lane. 17053for (
autoI = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
17056"Worklist not sorted properly!");
17058// For all instructions in blocks containing gather sequences: 17062if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17063 !GatherShuffleExtractSeq.contains(&In))
17066// Check if we can replace this instruction with any of the 17067// visited instructions. 17068bool Replaced =
false;
17071if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17072 DT->
dominates(V->getParent(), In.getParent())) {
17073 In.replaceAllUsesWith(V);
17075if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
17076if (!NewMask.
empty())
17077 SI->setShuffleMask(NewMask);
17081if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17082 GatherShuffleExtractSeq.contains(V) &&
17083 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17084 DT->
dominates(In.getParent(), V->getParent())) {
17086 V->replaceAllUsesWith(&In);
17088if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17089if (!NewMask.
empty())
17090 SI->setShuffleMask(NewMask);
17098 Visited.push_back(&In);
17103 GatherShuffleExtractSeq.clear();
17106BoUpSLP::ScheduleData *
17108 ScheduleData *Bundle =
nullptr;
17109 ScheduleData *PrevInBundle =
nullptr;
17110for (
Value *V : VL) {
17113 ScheduleData *BundleMember = getScheduleData(V);
17115"no ScheduleData for bundle member " 17116"(maybe not in same basic block)");
17117assert(BundleMember->isSchedulingEntity() &&
17118"bundle member already part of other bundle");
17120 PrevInBundle->NextInBundle = BundleMember;
17122 Bundle = BundleMember;
17125// Group the instructions to a bundle. 17126 BundleMember->FirstInBundle = Bundle;
17127 PrevInBundle = BundleMember;
17129assert(Bundle &&
"Failed to find schedule bundle");
// Groups the instructions to a bundle (which is then a single scheduling
// entity) and schedules instructions until the bundle gets ready.
std::optional<BoUpSLP::ScheduleData *>
BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
                                            const InstructionsState &S) {
  // No need to schedule PHIs, insertelement, extractelement and extractvalue
  // instructions.
  if (isa<PHINode>(S.getMainOp()) ||
      isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
    return nullptr;

  // Initialize the instruction bundle.
  Instruction *OldScheduleEnd = ScheduleEnd;
  LLVM_DEBUG(dbgs() << "SLP:  bundle: " << *S.getMainOp() << "\n");

  auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
                                                           ScheduleData *Bundle) {
    // The scheduling region got new instructions at the lower end (or it is a
    // new region for the first bundle). This makes it necessary to
    // recalculate all dependencies.
    // It is seldom that this needs to be done a second time after adding the
    // initial bundle to the region.
    if (ScheduleEnd != OldScheduleEnd) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
        if (ScheduleData *SD = getScheduleData(I))
          SD->clearDependencies();
      ReSchedule = true;
    }
    if (Bundle) {
      LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
                        << " in block " << BB->getName() << "\n");
      calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
    }

    if (ReSchedule) {
      resetSchedule();
      initialFillReadyList(ReadyInsts);
    }

    // Now try to schedule the new bundle or (if no bundle) just calculate
    // dependencies. As soon as the bundle is "ready" it means that there are
    // no cyclic dependencies and we can schedule it. Note that it's important
    // that we don't "schedule" the bundle yet (see cancelScheduling).
    while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
           !ReadyInsts.empty()) {
      ScheduleData *Picked = ReadyInsts.pop_back_val();
      assert(Picked->isSchedulingEntity() && Picked->isReady() &&
             "must be ready to schedule");
      schedule(Picked, ReadyInsts);
    }
  };
  // Make sure that the scheduling region contains all
  // instructions of the bundle.
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    if (!extendSchedulingRegion(V, S)) {
      // If the scheduling region got new instructions at the lower end (or it
      // is a new region for the first bundle), it is necessary to recalculate
      // all dependencies.
      // Otherwise the compiler may crash trying to incorrectly calculate
      // dependencies and emit instructions in the wrong order at the actual
      // scheduling.
      TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
      return std::nullopt;
    }
  }

  bool ReSchedule = false;
  for (Value *V : VL) {
    if (doesNotNeedToBeScheduled(V))
      continue;
    ScheduleData *BundleMember = getScheduleData(V);
    assert(BundleMember &&
           "no ScheduleData for bundle member (maybe not in same basic block)");

    // Make sure we don't leave the pieces of the bundle in the ready list when
    // the whole bundle might not be ready.
    ReadyInsts.remove(BundleMember);

    if (!BundleMember->IsScheduled)
      continue;
    // A bundle member was scheduled as single instruction before and now
    // needs to be scheduled as part of the bundle. We just get rid of the
    // existing schedule.
    LLVM_DEBUG(dbgs() << "SLP:  reset schedule because " << *BundleMember
                      << " was already scheduled\n");
    ReSchedule = true;
  }

  auto *Bundle = buildBundle(VL);
  TryScheduleBundleImpl(ReSchedule, Bundle);
  if (!Bundle->isReady()) {
    cancelScheduling(VL, S.getMainOp());
    return std::nullopt;
  }
  return Bundle;
}
void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
                                                Value *OpValue) {
  if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
      doesNotNeedToSchedule(VL))
    return;

  ScheduleData *Bundle = getScheduleData(OpValue);
  LLVM_DEBUG(dbgs() << "SLP:  cancel scheduling of " << *Bundle << "\n");
  assert(!Bundle->IsScheduled &&
         "Can't cancel bundle which is already scheduled");
  assert(Bundle->isSchedulingEntity() &&
         (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
         "tried to unbundle something which is not a bundle");

  // Remove the bundle from the ready list.
  if (Bundle->isReady())
    ReadyInsts.remove(Bundle);

  // Un-bundle: make single instructions out of the bundle.
  ScheduleData *BundleMember = Bundle;
  while (BundleMember) {
    assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
    BundleMember->FirstInBundle = BundleMember;
    ScheduleData *Next = BundleMember->NextInBundle;
    BundleMember->NextInBundle = nullptr;
    BundleMember->TE = nullptr;
    if (BundleMember->unscheduledDepsInBundle() == 0)
      ReadyInsts.insert(BundleMember);
    BundleMember = Next;
  }
}
BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
  // Allocate a new ScheduleData for the instruction.
  if (ChunkPos >= ChunkSize) {
    ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
    ChunkPos = 0;
  }
  return &(ScheduleDataChunks.back()[ChunkPos++]);
}
bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
    Value *V, const InstructionsState &S) {
  Instruction *I = dyn_cast<Instruction>(V);
  assert(I && "bundle member must be an instruction");
  assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
         !doesNotNeedToBeScheduled(I) &&
         "phi nodes/insertelements/extractelements/extractvalues don't need to "
         "be scheduled");
  if (getScheduleData(I))
    return true;
  if (!ScheduleStart) {
    // It's the first instruction in the new region.
    initScheduleData(I, I->getNextNode(), nullptr, nullptr);
    ScheduleStart = I;
    ScheduleEnd = I->getNextNode();
    assert(ScheduleEnd && "tried to vectorize a terminator?");
    LLVM_DEBUG(dbgs() << "SLP:  initialize schedule region to " << *I << "\n");
    return true;
  }
  // Search up and down at the same time, because we don't know if the new
  // instruction is above or below the existing scheduling region.
  // Ignore debug info (and other "AssumeLike" intrinsics) so that's not
  // counted against the budget. Otherwise debug info could affect codegen.
  BasicBlock::reverse_iterator UpIter =
      ++ScheduleStart->getIterator().getReverse();
  BasicBlock::reverse_iterator UpperEnd = BB->rend();
  BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
  BasicBlock::iterator LowerEnd = BB->end();
  auto IsAssumeLikeIntr = [](const Instruction &I) {
    if (auto *II = dyn_cast<IntrinsicInst>(&I))
      return II->isAssumeLikeIntrinsic();
    return false;
  };
  UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
  DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
         &*DownIter != I) {
    if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
      LLVM_DEBUG(dbgs() << "SLP:  exceeded schedule region size limit\n");
      return false;
    }

    ++UpIter;
    ++DownIter;

    UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
    DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
  }
  if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
    assert(I->getParent() == ScheduleStart->getParent() &&
           "Instruction is in wrong basic block.");
    initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
    ScheduleStart = I;
    LLVM_DEBUG(dbgs() << "SLP:  extend schedule region start to " << *I
                      << "\n");
    return true;
  }
  assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
         "Expected to reach top of the basic block or instruction down the "
         "lower end.");
  assert(I->getParent() == ScheduleEnd->getParent() &&
         "Instruction is in wrong basic block.");
  initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
                   nullptr);
  ScheduleEnd = I->getNextNode();
  assert(ScheduleEnd && "tried to vectorize a terminator?");
  LLVM_DEBUG(dbgs() << "SLP:  extend schedule region end to " << *I << "\n");
  return true;
}
void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
                                                Instruction *ToI,
                                                ScheduleData *PrevLoadStore,
                                                ScheduleData *NextLoadStore) {
  ScheduleData *CurrentLoadStore = PrevLoadStore;
  for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
    // No need to allocate data for non-schedulable instructions.
    if (doesNotNeedToBeScheduled(I))
      continue;
    ScheduleData *SD = ScheduleDataMap.lookup(I);
    if (!SD) {
      SD = allocateScheduleDataChunks();
      ScheduleDataMap[I] = SD;
    }
    assert(!isInSchedulingRegion(SD) &&
           "new ScheduleData already in scheduling region");
    SD->init(SchedulingRegionID, I);

    if (I->mayReadOrWriteMemory() &&
        (!isa<IntrinsicInst>(I) ||
         (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
          cast<IntrinsicInst>(I)->getIntrinsicID() !=
              Intrinsic::pseudoprobe))) {
      // Update the linked list of memory accessing instructions.
      if (CurrentLoadStore) {
        CurrentLoadStore->NextLoadStore = SD;
      } else {
        FirstLoadStoreInRegion = SD;
      }
      CurrentLoadStore = SD;
    }

    if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
        match(I, m_Intrinsic<Intrinsic::stackrestore>()))
      RegionHasStackSave = true;
  }
  if (NextLoadStore) {
    if (CurrentLoadStore)
      CurrentLoadStore->NextLoadStore = NextLoadStore;
  } else {
    LastLoadStoreInRegion = CurrentLoadStore;
  }
}
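// Illustrative sketch for initScheduleData above (hypothetical block): for
//   %a = load i32, ptr %p
//   %b = add i32 %a, 1
//   store i32 %b, ptr %q
// only the load and the store are linked into the FirstLoadStoreInRegion /
// NextLoadStore chain; the add gets plain ScheduleData with no memory link,
// so the alias-dependency walk in calculateDependencies below only has to
// visit the two memory instructions.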
void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
                                                     bool InsertInReadyList,
                                                     BoUpSLP *SLP) {
  assert(SD->isSchedulingEntity());

  SmallVector<ScheduleData *, 10> WorkList;
  WorkList.push_back(SD);

  while (!WorkList.empty()) {
    ScheduleData *SD = WorkList.pop_back_val();
    for (ScheduleData *BundleMember = SD; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      assert(isInSchedulingRegion(BundleMember));
      if (BundleMember->hasValidDependencies())
        continue;

      BundleMember->Dependencies = 0;
      BundleMember->resetUnscheduledDeps();

      // Handle def-use chain dependencies.
      for (User *U : BundleMember->Inst->users()) {
        if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
          BundleMember->Dependencies++;
          ScheduleData *DestBundle = UseSD->FirstInBundle;
          if (!DestBundle->IsScheduled)
            BundleMember->incrementUnscheduledDeps(1);
          if (!DestBundle->hasValidDependencies())
            WorkList.push_back(DestBundle);
        }
      }

      auto MakeControlDependent = [&](Instruction *I) {
        auto *DepDest = getScheduleData(I);
        assert(DepDest && "must be in schedule window");
        DepDest->ControlDependencies.push_back(BundleMember);
        BundleMember->Dependencies++;
        ScheduleData *DestBundle = DepDest->FirstInBundle;
        if (!DestBundle->IsScheduled)
          BundleMember->incrementUnscheduledDeps(1);
        if (!DestBundle->hasValidDependencies())
          WorkList.push_back(DestBundle);
      };

      // Any instruction which isn't safe to speculate at the beginning of the
      // block is control dependent on any early exit or non-willreturn call
      // which precedes it.
      if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
        for (Instruction *I = BundleMember->Inst->getNextNode();
             I != ScheduleEnd; I = I->getNextNode()) {
          if (isSafeToSpeculativelyExecute(I))
            continue;

          // Add the dependency
          MakeControlDependent(I);

          if (!isGuaranteedToTransferExecutionToSuccessor(I))
            // Everything past here must be control dependent on I.
            break;
        }
      }

      if (RegionHasStackSave) {
        // If we have an inalloca alloca instruction, it needs to be scheduled
        // after any preceding stacksave. We also need to prevent any alloca
        // from reordering above a preceding stackrestore.
        if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
            match(BundleMember->Inst,
                  m_Intrinsic<Intrinsic::stackrestore>())) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
                match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              // Any allocas past here must be control dependent on I, and I
              // must be memory dependent on BundleMember->Inst.
              break;

            if (!isa<AllocaInst>(I))
              continue;

            // Add the dependency
            MakeControlDependent(I);
          }
        }

        // In addition to the cases handled just above, we need to prevent
        // allocas and loads/stores from moving below a stacksave or a
        // stackrestore. Avoiding moving allocas below stackrestore is
        // currently thought to be conservatism. Moving loads/stores below a
        // stackrestore can lead to incorrect code.
        if (isa<AllocaInst>(BundleMember->Inst) ||
            BundleMember->Inst->mayReadOrWriteMemory()) {
          for (Instruction *I = BundleMember->Inst->getNextNode();
               I != ScheduleEnd; I = I->getNextNode()) {
            if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
                !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
              continue;

            // Add the dependency
            MakeControlDependent(I);
            break;
          }
        }
      }
17502"NextLoadStore list for non memory effecting bundle?");
17504bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17505unsigned NumAliased = 0;
17506unsigned DistToSrc = 1;
17508for (; DepDest; DepDest = DepDest->NextLoadStore) {
17509assert(isInSchedulingRegion(DepDest));
17511// We have two limits to reduce the complexity: 17512// 1) AliasedCheckLimit: It's a small limit to reduce calls to 17513// SLP->isAliased (which is the expensive part in this loop). 17514// 2) MaxMemDepDistance: It's for very large blocks and it aborts 17515// the whole loop (even if the loop is fast, it's quadratic). 17516// It's important for the loop break condition (see below) to 17517// check this limit even between two read-only instructions. 17519 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17521 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17523// We increment the counter only if the locations are aliased 17524// (instead of counting all alias checks). This gives a better 17525// balance between reduced runtime and accurate dependencies. 17528 DepDest->MemoryDependencies.push_back(BundleMember);
17529 BundleMember->Dependencies++;
17530 ScheduleData *DestBundle = DepDest->FirstInBundle;
17531if (!DestBundle->IsScheduled) {
17532 BundleMember->incrementUnscheduledDeps(1);
17534if (!DestBundle->hasValidDependencies()) {
17539// Example, explaining the loop break condition: Let's assume our 17540// starting instruction is i0 and MaxMemDepDistance = 3. 17543// i0,i1,i2,i3,i4,i5,i6,i7,i8 17546// MaxMemDepDistance let us stop alias-checking at i3 and we add 17547// dependencies from i0 to i3,i4,.. (even if they are not aliased). 17548// Previously we already added dependencies from i3 to i6,i7,i8 17549// (because of MaxMemDepDistance). As we added a dependency from 17550// i0 to i3, we have transitive dependencies from i0 to i6,i7,i8 17551// and we can abort this loop at i6. 17557if (InsertInReadyList && SD->isReady()) {
17558 ReadyInsts.insert(SD);
void BoUpSLP::BlockScheduling::resetSchedule() {
  assert(ScheduleStart &&
         "tried to reset schedule on block which has not been scheduled");
  for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
    if (ScheduleData *SD = getScheduleData(I)) {
      assert(isInSchedulingRegion(SD) &&
             "ScheduleData not in scheduling region");
      SD->IsScheduled = false;
      SD->resetUnscheduledDeps();
    }
  }
  ReadyInsts.clear();
}
void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
  if (!BS->ScheduleStart)
    return;

  LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

  // A key point - if we got here, pre-scheduling was able to find a valid
  // scheduling of the sub-graph of the scheduling window which consists
  // of all vector bundles and their transitive users. As such, we do not
  // need to reschedule anything *outside of* that subgraph.

  BS->resetSchedule();

  // For the real scheduling we use a more sophisticated ready-list: it is
  // sorted by the original instruction location. This lets the final schedule
  // be as close as possible to the original instruction order.
  // WARNING: If changing this order causes a correctness issue, that means
  // there is some missing dependence edge in the schedule data graph.
  struct ScheduleDataCompare {
    bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
      return SD2->SchedulingPriority < SD1->SchedulingPriority;
    }
  };
  std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

  // Ensure that all dependency data is updated (for nodes in the sub-graph)
  // and fill the ready-list with initial instructions.
  int Idx = 0;
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    if (ScheduleData *SD = BS->getScheduleData(I)) {
      [[maybe_unused]] TreeEntry *SDTE = getTreeEntry(SD->Inst);
      assert((isVectorLikeInstWithConstOps(SD->Inst) ||
              SD->isPartOfBundle() ==
                  (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;

      if (SD->isSchedulingEntity() && SD->isPartOfBundle())
        BS->calculateDependencies(SD, false, this);
    }
  }
  BS->initialFillReadyList(ReadyInsts);

  Instruction *LastScheduledInst = BS->ScheduleEnd;

  // Do the "real" scheduling.
  while (!ReadyInsts.empty()) {
    ScheduleData *Picked = *ReadyInsts.begin();
    ReadyInsts.erase(ReadyInsts.begin());

    // Move the scheduled instruction(s) to their dedicated places, if not
    // there yet.
    for (ScheduleData *BundleMember = Picked; BundleMember;
         BundleMember = BundleMember->NextInBundle) {
      Instruction *PickedInst = BundleMember->Inst;
      if (PickedInst->getNextNode() != LastScheduledInst)
        PickedInst->moveBefore(LastScheduledInst);
      LastScheduledInst = PickedInst;
    }

    BS->schedule(Picked, ReadyInsts);
  }

  // Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
  BS->verify();
#endif

#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
  // Check that all schedulable entities got scheduled
  for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
       I = I->getNextNode()) {
    ScheduleData *SD = BS->getScheduleData(I);
    if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
      assert(SD->IsScheduled && "must be scheduled at this point");
  }
#endif

  // Avoid duplicate scheduling of the block.
  BS->ScheduleStart = nullptr;
}
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  // If V is a store, just return the width of the stored value (or value
  // truncated just before storing) without traversing the expression tree.
  // This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL->getTypeSizeInBits(Store->getValueOperand()->getType());

  if (auto *IEI = dyn_cast<InsertElementInst>(V))
    return getVectorElementSize(IEI->getOperand(1));

  auto E = InstrElementSize.find(V);
  if (E != InstrElementSize.end())
    return E->second;

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<std::pair<Instruction *, BasicBlock *>, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V)) {
    Worklist.emplace_back(I, I->getParent());
    Visited.insert(I);
  }

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  auto Width = 0u;
  Value *FirstNonBool = nullptr;
  while (!Worklist.empty()) {
    auto [I, Parent] = Worklist.pop_back_val();

    // We should only be looking at scalar instructions here. If the current
    // instruction has a vector type, skip.
    auto *Ty = I->getType();
    if (isa<VectorType>(Ty))
      break;
    if (Ty != Builder.getInt1Ty() && !FirstNonBool)
      FirstNonBool = I;

    // If the current instruction is a load, update MaxWidth to reflect the
    // width of the loaded value.
    if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
      Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));

    // Otherwise, we need to visit the operands of the instruction. We only
    // handle the interesting cases from buildTree here. If an operand is an
    // instruction we haven't yet visited and from the same basic block as the
    // user or the use is a PHI node, we add it to the worklist.
    else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
                 BinaryOperator, UnaryOperator>(I)) {
      for (Use &U : I->operands()) {
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (Visited.insert(J).second &&
              (isa<PHINode>(I) || J->getParent() == Parent)) {
            Worklist.emplace_back(J, J->getParent());
            continue;
          }
        if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
          FirstNonBool = U.get();
      }
    } else {
      break;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V. Otherwise, return
  // the maximum width we found.
  if (!Width) {
    if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
      V = FirstNonBool;
    Width = DL->getTypeSizeInBits(V->getType());
  }

  for (Instruction *I : Visited)
    InstrElementSize[I] = Width;

  return Width;
}
17744bool BoUpSLP::collectValuesToDemote(
17745const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
17748bool &IsProfitableToDemote,
bool IsTruncRoot)
const{
17749// We can always demote constants. 17750if (
all_of(E.Scalars, IsaPred<Constant>))
17753unsigned OrigBitWidth =
17754DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17760// Check if the node was analyzed already and must keep its original bitwidth. 17761if (NodesToKeepBWs.
contains(E.Idx))
17764// If the value is not a vectorized instruction in the expression and not used 17765// by the insertelement instruction and not used in multiple vector nodes, it 17766// cannot be demoted. 17767bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
17768 if (isa<PoisonValue>(R))
17770 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17772auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
17773if (isa<PoisonValue>(V))
    // For the last shuffle of sext/zext with many uses, we need to check the
    // extra bit for unsigned values; otherwise we may get incorrect casting
    // for reused scalars.
    if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17787unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17790if (
auto *
I = dyn_cast<Instruction>(V)) {
17792unsigned BitWidth2 =
17793 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17794while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17800 BitWidth1 = std::min(BitWidth1, BitWidth2);
17805auto FinalAnalysis = [&,
TTI =
TTI]() {
17806if (!IsProfitableToDemote)
17809 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
17811if (Res && E.isGather()) {
17812// Check possible extractelement instructions bases and final vector 17815for (
Value *V : E.Scalars) {
17816auto *EE = dyn_cast<ExtractElementInst>(V);
17819 UniqueBases.
insert(EE->getVectorOperand());
17821constunsigned VF = E.Scalars.size();
17822Type *OrigScalarTy = E.Scalars.front()->getType();
17823if (UniqueBases.
size() <= 2 ||
17834if (E.isGather() || !Visited.
insert(&E).second ||
17836 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17837 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17840return FinalAnalysis();
17843 return !all_of(V->users(), [=](User *U) {
17844 return getTreeEntry(U) ||
17845 (E.Idx == 0 && UserIgnoreList &&
17846 UserIgnoreList->contains(U)) ||
17847 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17848 !U->getType()->isScalableTy() &&
17849 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17850 }) && !IsPotentiallyTruncated(V,
BitWidth);
17857unsigned InitLevel = MaxDepthLevel;
17859unsigned Level = InitLevel;
17860if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
17861 ToDemote, Visited, NodesToKeepBWs, Level,
17862 IsProfitableToDemote, IsTruncRoot)) {
17863if (!IsProfitableToDemote)
17866if (!FinalAnalysis())
17870 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17874auto AttemptCheckBitwidth =
17876// Try all bitwidth < OrigBitWidth. 17878unsigned BestFailBitwidth = 0;
17880if (Checker(
BitWidth, OrigBitWidth))
17882if (BestFailBitwidth == 0 && FinalAnalysis())
17886if (BestFailBitwidth == 0) {
17897auto TryProcessInstruction =
17903 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17906// Several vectorized uses? Check if we can truncate it, otherwise - 17908if (E.UserTreeIndices.size() > 1 &&
17909 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17912bool NeedToExit =
false;
17913if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17917if (!ProcessOperands(
Operands, NeedToExit))
17924// Record the entry that we can demote. 17926return IsProfitableToDemote;
17928switch (E.getOpcode()) {
17930// We can always demote truncations and extensions. Since truncations can 17931// seed additional demotion, we save the truncated value. 17932case Instruction::Trunc:
17933if (IsProfitableToDemoteRoot)
17934 IsProfitableToDemote =
true;
17935return TryProcessInstruction(
BitWidth);
17936case Instruction::ZExt:
17937case Instruction::SExt:
17938 IsProfitableToDemote =
true;
17939return TryProcessInstruction(
BitWidth);
17941// We can demote certain binary operations if we can demote both of their 17943case Instruction::Add:
17944case Instruction::Sub:
17945case Instruction::Mul:
17946case Instruction::And:
17947case Instruction::Or:
17948case Instruction::Xor: {
17949return TryProcessInstruction(
17950BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17952case Instruction::Freeze:
17953return TryProcessInstruction(
BitWidth, getOperandEntry(&E, 0));
  case Instruction::Shl: {
    // If we are truncating the result of this SHL, and if it's a shift of an
    // inrange amount, we can always perform a SHL in a smaller type.
    auto ShlChecker = [&](unsigned BitWidth, unsigned) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        return AmtKnownBits.getMaxValue().ult(BitWidth);
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
  }
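  // Illustrative sketch for the Shl rule above (hypothetical scalars): a
  // bundle of
  //   %s0 = shl i32 %x0, 3
  //   %s1 = shl i32 %x1, 5
  // can be demoted to i16 because every shift amount is known to be < 16, so
  // the narrower shift produces the same low 16 bits as the original.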
  case Instruction::LShr: {
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        if (isa<PoisonValue>(V))
          return true;
        auto *I = cast<Instruction>(V);
        KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
        APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return AmtKnownBits.getMaxValue().ult(BitWidth) &&
               MaskedValueIsZero(I->getOperand(0), ShiftedBits,
                                 SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
        LShrChecker);
  }
17989case Instruction::AShr: {
17990// If this is a truncate of an arithmetic shr, we can truncate it to a 17991// smaller ashr iff we know that all the bits from the sign bit of the 17992// original type and the sign bit of the truncate type are similar. 17993auto AShrChecker = [&](
unsignedBitWidth,
unsigned OrigBitWidth) {
17995 if (isa<PoisonValue>(V))
17997 auto *I = cast<Instruction>(V);
17998 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17999 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18000 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18001 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18005return TryProcessInstruction(
18006BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
  case Instruction::UDiv:
  case Instruction::URem: {
    // UDiv and URem can be truncated if all the truncated bits are zero.
    auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
        return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
               MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
      });
    };
    return TryProcessInstruction(
        BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
  }
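  // Illustrative sketch for the UDiv/URem rule above (hypothetical scalars):
  //   %d0 = udiv i32 %a0, %b0
  //   %d1 = udiv i32 %a1, %b1
  // can be demoted to i16 only when the top 16 bits of every operand are
  // known zero; then the i16 udiv yields the same low 16 bits as the i32 one.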
18025// We can demote selects if we can demote their true and false values. 18026case Instruction::Select: {
18027return TryProcessInstruction(
18028BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18031// We can demote phis if we can demote all their incoming operands. Note that 18032// we don't need to worry about cycles since we ensure single use above. 18033case Instruction::PHI: {
18034constunsigned NumOps = E.getNumOperands();
18037 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
18039return TryProcessInstruction(
BitWidth, Ops);
18042case Instruction::Call: {
18043auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18047if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
18048ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
    SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
    function_ref<bool(unsigned, unsigned)> CallChecker;
    auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
          APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
          return MaskedValueIsZero(I->getOperand(0), Mask,
                                   SimplifyQuery(*DL)) &&
                 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
        }
        assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
               "Expected min/max intrinsics only.");
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        unsigned Op1SignBits =
            ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask,
                                  SimplifyQuery(*DL))) &&
               SignBits <= Op1SignBits &&
               ((SignBits != Op1SignBits &&
                 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
      });
    };
    auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
      return all_of(E.Scalars, [&](Value *V) {
        auto *I = cast<Instruction>(V);
        unsigned SignBits = OrigBitWidth - BitWidth;
        APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
        unsigned Op0SignBits =
            ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
        return SignBits <= Op0SignBits &&
               ((SignBits != Op0SignBits &&
                 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
                MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
      });
    };
    if (ID != Intrinsic::abs) {
      Operands.push_back(getOperandEntry(&E, 1));
      CallChecker = CompChecker;
    } else {
      CallChecker = AbsChecker;
    }
18102 std::numeric_limits<InstructionCost::CostType>::max();
18104unsigned VF = E.Scalars.size();
18105// Choose the best bitwidth based on cost estimations. 18114if (
Cost < BestCost) {
18120 [[maybe_unused]]
bool NeedToExit;
18121 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18126// Otherwise, conservatively give up. 18131return FinalAnalysis();
void BoUpSLP::computeMinimumValueSizes() {
  // We only attempt to truncate integer expressions.
  bool IsStoreOrInsertElt =
      VectorizableTree.front()->hasState() &&
      (VectorizableTree.front()->getOpcode() == Instruction::Store ||
       VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
  if ((IsStoreOrInsertElt || UserIgnoreList) &&
      ExtraBitWidthNodes.size() <= 1 &&
      (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
       CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
    return;
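  // Minimal end-to-end illustration (not from the source): in
  //   %a = load i32, ptr %p
  //   %b = load i32, ptr %q
  //   %s = add i32 %a, %b
  //   %t = trunc i32 %s to i16
  //   store i16 %t, ptr %r
  // only the low 16 bits of the add are demanded, so the analysis below can
  // record in MinBWs that the add (and its operands) may be vectorized at i16
  // rather than i32.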
18148unsigned NodeIdx = 0;
18149if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18152// Ensure the roots of the vectorizable tree don't form a cycle. 18153if (VectorizableTree[NodeIdx]->
isGather() ||
18154 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18155 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18157return EI.
UserTE->Idx > NodeIdx;
18161// The first value node for store/insertelement is sext/zext/trunc? Skip it, 18162// resize to the final type. 18163bool IsTruncRoot =
false;
18164bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18168 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18169 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
18170assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
18173 IsProfitableToDemoteRoot =
true;
18177// Analyzed the reduction already and not profitable - exit. 18178if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
18182auto ComputeMaxBitWidth =
18183 [&](
const TreeEntry &E,
bool IsTopRoot,
bool IsProfitableToDemoteRoot,
18184unsigned Limit,
bool IsTruncRoot,
bool IsSignedCmp) ->
unsigned {
18186// Check if the root is trunc and the next node is gather/buildvector, then 18187// keep trunc in scalars, which is free in most cases. 18188if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18189 !NodesToKeepBWs.
contains(E.Idx) &&
18190 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18192return V->hasOneUse() || isa<Constant>(V) ||
18195 const TreeEntry *TE = getTreeEntry(U);
18196 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18197 if (TE == UserTE || !TE)
18199 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18201 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18202 SelectInst>(UserTE->getMainOp()))
18204 unsigned UserTESz = DL->getTypeSizeInBits(
18205 UserTE->Scalars.front()->getType());
18206 auto It = MinBWs.find(TE);
18207 if (It != MinBWs.end() && It->second.first > UserTESz)
18209 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18213const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18214auto It = MinBWs.
find(UserTE);
18215if (It != MinBWs.
end())
18216return It->second.first;
18217unsigned MaxBitWidth =
18218DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18219 MaxBitWidth =
bit_ceil(MaxBitWidth);
18220if (MaxBitWidth < 8 && MaxBitWidth > 1)
18228unsigned VF = E.getVectorFactor();
18229Type *ScalarTy = E.Scalars.front()->getType();
18231auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->
getScalarType());
18236 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
18242// The maximum bit width required to represent all the values that can be 18243// demoted without loss of precision. It would be safe to truncate the roots 18244// of the expression to this width. 18245unsigned MaxBitWidth = 1u;
18247// True if the roots can be zero-extended back to their original type, 18248// rather than sign-extended. We know that if the leading bits are not 18249// demanded, we can safely zero-extend. So we initialize IsKnownPositive to 18251// Determine if the sign bit of all the roots is known to be zero. If not, 18252// IsKnownPositive is set to False. 18253bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
18254 if (isa<PoisonValue>(R))
18256 KnownBits Known = computeKnownBits(R, *DL);
18257 return Known.isNonNegative();
18260// We first check if all the bits of the roots are demanded. If they're not, 18261// we can truncate the roots to this narrower type. 18262for (
Value *Root : E.Scalars) {
18263if (isa<PoisonValue>(Root))
18268unsigned BitWidth1 = NumTypeBits - NumSignBits;
18269// If we can't prove that the sign bit is zero, we must add one to the 18270// maximum bit width to account for the unknown sign bit. This preserves 18271// the existing sign bit so we can safely sign-extend the root back to the 18272// original type. Otherwise, if we know the sign bit is zero, we will 18273// zero-extend the root instead. 18275// FIXME: This is somewhat suboptimal, as there will be cases where adding 18276// one to the maximum bit width will yield a larger-than-necessary 18277// type. In general, we need to add an extra bit only if we can't 18278// prove that the upper bit of the original type is equal to the 18279// upper bit of the proposed smaller type. If these two bits are 18280// the same (either zero or one) we know that sign-extending from 18281// the smaller type will result in the same value. Here, since we 18282// can't yet prove this, we are just making the proposed smaller 18283// type larger to ensure correctness. 18284if (!IsKnownPositive)
18288unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18290 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
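      // Worked example (illustrative only, not from the source): if a root is
      // `sext i8 %v to i32`, ComputeNumSignBits returns 25, so
      // BitWidth1 = 32 - 25 = 7, plus one bit because the sign is not known to
      // be zero, giving 8. If the users of the root only demand the low 16
      // bits, the demanded-bits mask has 16 leading zeros and BitWidth2 = 16,
      // so this root contributes min(8, 16) = 8 bits to MaxBitWidth.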
18293if (MaxBitWidth < 8 && MaxBitWidth > 1)
18296// If the original type is large, but reduced type does not improve the reg 18306unsigned Opcode = E.getOpcode();
18307bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18308 Opcode == Instruction::SExt ||
18309 Opcode == Instruction::ZExt || NumParts > 1;
18310// Conservatively determine if we can actually truncate the roots of the 18311// expression. Collect the values that can be demoted in ToDemote and 18312// additional roots that require investigating in Roots. 18314unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18315bool NeedToDemote = IsProfitableToDemote;
18317if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18318 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18319 NeedToDemote, IsTruncRoot) ||
18320 (MaxDepthLevel <= Limit &&
18321 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18322 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18323DL->getTypeSizeInBits(TreeRootIT) /
18324DL->getTypeSizeInBits(
18325 E.getMainOp()->getOperand(0)->getType()) >
18328// Round MaxBitWidth up to the next power-of-two. 18329 MaxBitWidth =
bit_ceil(MaxBitWidth);
18334// If we can truncate the root, we must collect additional values that might 18335// be demoted as a result. That is, those seeded by truncations we will 18337// Add reduction ops sizes, if any. 18338if (UserIgnoreList &&
18339 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18340// Convert vector_reduce_add(ZExt(<n x i1>)) to ZExtOrTrunc(ctpop(bitcast <n 18342if (
all_of(*UserIgnoreList,
18344return isa<PoisonValue>(V) ||
18345 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18347 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18348 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18349 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18351 ReductionBitWidth = 1;
18353for (
Value *V : *UserIgnoreList) {
18354if (isa<PoisonValue>(V))
18357TypeSize NumTypeBits =
DL->getTypeSizeInBits(V->getType());
18358unsigned BitWidth1 = NumTypeBits - NumSignBits;
18361unsigned BitWidth2 = BitWidth1;
18364 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18366 ReductionBitWidth =
18367 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18369if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18370 ReductionBitWidth = 8;
18372 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
18375bool IsTopRoot = NodeIdx == 0;
18376while (NodeIdx < VectorizableTree.size() &&
18377 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18378 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18383bool IsSignedCmp =
false;
18384while (NodeIdx < VectorizableTree.size()) {
18388 ReductionBitWidth ==
18389DL->getTypeSizeInBits(
18390 VectorizableTree.front()->Scalars.front()->getType()))
18392unsigned MaxBitWidth = ComputeMaxBitWidth(
18393 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18394 IsTruncRoot, IsSignedCmp);
18395if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
18396if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18397 ReductionBitWidth =
bit_ceil(MaxBitWidth);
18398elseif (MaxBitWidth == 0)
18399 ReductionBitWidth = 0;
18402for (
unsignedIdx : RootDemotes) {
18405DL->getTypeSizeInBits(V->getType()->getScalarType());
18406if (OrigBitWidth > MaxBitWidth) {
18414 RootDemotes.clear();
18416 IsProfitableToDemoteRoot =
true;
18418if (ExtraBitWidthNodes.
empty()) {
18419 NodeIdx = VectorizableTree.size();
18421unsigned NewIdx = 0;
18423 NewIdx = *ExtraBitWidthNodes.
begin();
18424 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
18425 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
18428 NodeIdx < VectorizableTree.size() &&
18429any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18432 EI.
UserTE->getOpcode() == Instruction::Trunc &&
18433 !EI.
UserTE->isAltShuffle();
18436 NodeIdx < VectorizableTree.size() &&
18438 VectorizableTree[NodeIdx]->UserTreeIndices,
18440return (EI.
UserTE->hasState() &&
18441 EI.
UserTE->getOpcode() == Instruction::ICmp) &&
18443 auto *IC = dyn_cast<ICmpInst>(V);
18446 !isKnownNonNegative(IC->getOperand(0),
18447 SimplifyQuery(*DL)) ||
18448 !isKnownNonNegative(IC->getOperand(1),
18449 SimplifyQuery(*DL)));
18454// If the maximum bit width we compute is less than the width of the roots' 18455// type, we can proceed with the narrowing. Otherwise, do nothing. 18456if (MaxBitWidth == 0 ||
18458 cast<IntegerType>(TreeRoot.
front()->getType()->getScalarType())
18466// Finally, map the values we can demote to the maximum bit with we 18468for (
unsignedIdx : ToDemote) {
18469 TreeEntry *TE = VectorizableTree[
Idx].get();
18472bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
18473 if (isa<PoisonValue>(R))
18475 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18493bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
18518DL = &
F.getDataLayout();
18522bool Changed =
false;
18524// If the target claims to have no vector registers don't attempt 18528dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
18532// Don't vectorize when the attribute NoImplicitFloat is used. 18533if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
18536LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
18538// Use the bottom up slp vectorizer to construct chains that start with 18539// store instructions. 18540BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
18542// A general note: the vectorizer must use BoUpSLP::eraseInstruction() to 18543// delete instructions. 18545// Update DFS numbers now so that we can use them for ordering. 18548// Scan the blocks in the function in post order. 18553// Start new block - clear the list of reduction roots. 18554 R.clearReductionData();
18555 collectSeedInstructions(BB);
18557// Vectorize trees that end at stores. 18558if (!Stores.
empty()) {
18560 <<
" underlying objects.\n");
18561 Changed |= vectorizeStoreChains(R);
18564// Vectorize trees that end at reductions. 18565 Changed |= vectorizeChainsInBlock(BB, R);
18567// Vectorize the index computations of getelementptr instructions. This 18568// is primarily intended to catch gather-like idioms ending at 18569// non-consecutive loads. 18570if (!GEPs.
empty()) {
18572 <<
" underlying objects.\n");
18573 Changed |= vectorizeGEPIndices(BB, R);
18578 R.optimizeGatherSequence();
18586unsignedIdx,
unsigned MinVF,
18591constunsigned Sz = R.getVectorElementSize(Chain[0]);
18592unsigned VF = Chain.
size();
18596 *
TTI, cast<StoreInst>(Chain.
front())->getValueOperand()->getType(),
18598 VF < 2 || VF < MinVF) {
18599// Check if vectorizing with a non-power-of-2 VF should be considered. At 18600// the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost 18601// all vector lanes are used. 18610for (
Value *V : Chain)
18611 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
18612// Operands are not same/alt opcodes or non-power-of-2 uniques - exit. 18614if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
18616bool IsAllowedSize =
18620if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18621 (!S.getMainOp()->isSafeToRemove() ||
18624 return !isa<ExtractElementInst>(V) &&
18625 (V->getNumUses() > Chain.size() ||
18626 any_of(V->users(), [&](User *U) {
18627 return !Stores.contains(U);
18630 (ValOps.
size() > Chain.size() / 2 && !S)) {
18631Size = (!IsAllowedSize && S) ? 1 : 2;
18635if (
R.isLoadCombineCandidate(Chain))
18638// Check if tree tiny and store itself or its value is not vectorized. 18639if (
R.isTreeTinyAndNotFullyVectorizable()) {
18640if (
R.isGathered(Chain.front()) ||
18641R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18642return std::nullopt;
18643Size =
R.getCanonicalGraphSize();
18646R.reorderTopToBottom();
18647R.reorderBottomToTop();
18649R.buildExternalUses();
18651R.computeMinimumValueSizes();
18653Size =
R.getCanonicalGraphSize();
18654if (S && S.getOpcode() == Instruction::Load)
18655Size = 2;
// cut off masked gather small trees 18665 cast<StoreInst>(Chain[0]))
18666 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
18667 <<
" and with tree size " 18668 <<
NV(
"TreeSize",
R.getTreeSize()));
/// Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
                           bool First) {
  const uint64_t Sum = std::accumulate(
      Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
      [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
        unsigned Size = First ? Val.first : Val.second;
        return V + Size;
      });
  const uint64_t Mean = Sum / Sizes.size();
  if (Mean == 0)
    return true;
  const uint64_t Dev =
      std::accumulate(Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
                      [&](uint64_t V,
                          const std::pair<unsigned, unsigned> &Val) {
                        unsigned P = First ? Val.first : Val.second;
                        return V + (P - Mean) * (P - Mean);
                      }) /
      Sizes.size();
  return Dev * 81 / (Mean * Mean) == 0;
}
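// Illustrative arithmetic (not from the source): for tree sizes {4, 4, 4, 4}
// the mean is 4 and the deviation sum is 0, so the check passes; for {2, 16}
// the mean is 9 and Dev = ((2 - 9)^2 + (16 - 9)^2) / 2 = 49, and since
// 49 * 81 / (9 * 9) = 49 != 0 the sizes are treated as too dissimilar.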
bool SLPVectorizerPass::vectorizeStores(
    ArrayRef<StoreInst *> Stores, BoUpSLP &R,
    DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
        &Visited) {
  // We may run into multiple chains that merge into a single chain. We mark
  // the stores that we vectorized so that we don't visit the same store twice.
  bool Changed = false;

  struct StoreDistCompare {
    bool operator()(const std::pair<unsigned, int> &Op1,
                    const std::pair<unsigned, int> &Op2) const {
      return Op1.second < Op2.second;
    }
  };
  // A set of pairs (index of store in Stores array ref, Distance of the store
  // address relative to base store address in units).
  using StoreIndexToDistSet =
      std::set<std::pair<unsigned, int>, StoreDistCompare>;
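  // Illustrative contents (assumption: i32 stores, so one unit is 4 bytes):
  // for Stores = {store to %p+4, store to %p, store to %p+8} with the first
  // store used as the base, the set holds {1, -1}, {0, 0}, {2, 1}, ordered by
  // the distance component.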
18726auto TryToVectorize = [&](
const StoreIndexToDistSet &
Set) {
18729// Collect the chain into a list. 18733 PrevDist =
Data.second;
18734if (
Idx !=
Set.size() - 1)
18739Operands.push_back(Stores[DataVar.first]);
18740 PrevDist = DataVar.second;
18745 .
insert({Operands.front(),
18746 cast<StoreInst>(Operands.front())->getValueOperand(),
18748 cast<StoreInst>(Operands.back())->getValueOperand(),
18753unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18754unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
18758 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18760Type *StoreTy =
Store->getValueOperand()->getType();
18761Type *ValueTy = StoreTy;
18762if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
18763 ValueTy = Trunc->getSrcTy();
18764unsigned MinVF = std::max<unsigned>(
18766R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18769if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                        << ") < "
                        << "MinVF (" << MinVF << ")\n");
18776unsigned NonPowerOf2VF = 0;
18778// First try vectorizing with a non-power-of-2 VF. At the moment, only 18779// consider cases where VF + 1 is a power-of-2, i.e. almost all vector 18781unsigned CandVF = std::clamp<unsigned>(
Operands.size(), MinVF, MaxVF);
18783 NonPowerOf2VF = CandVF;
18784assert(NonPowerOf2VF != MaxVF &&
18785"Non-power-of-2 VF should not be equal to MaxVF");
18789unsigned MaxRegVF = MaxVF;
18791if (MaxVF < MinVF) {
      LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
                        << ") < "
                        << "MinVF (" << MinVF << ")\n");
18800unsignedSize = MinVF;
18802 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
18806unsigned Repeat = 0;
18807constexprunsigned MaxAttempts = 4;
18809for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
18810P.first =
P.second = 1;
18813auto IsNotVectorized = [](
boolFirst,
18814const std::pair<unsigned, unsigned> &
P) {
18815returnFirst ?
P.first > 0 :
P.second > 0;
18817auto IsVectorized = [](
boolFirst,
18818const std::pair<unsigned, unsigned> &
P) {
18819returnFirst ?
P.first == 0 :
P.second == 0;
18821auto VFIsProfitable = [](
boolFirst,
unsignedSize,
18822const std::pair<unsigned, unsigned> &
P) {
18825auto FirstSizeSame = [](
unsignedSize,
18826const std::pair<unsigned, unsigned> &
P) {
18827returnSize ==
P.first;
18831bool RepeatChanged =
false;
18832bool AnyProfitableGraph =
false;
18833for (
unsignedSize : CandidateVFs) {
18834 AnyProfitableGraph =
false;
18835unsigned StartIdx = std::distance(
18836 RangeSizes.begin(),
18837find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
18838 std::placeholders::_1)));
18839while (StartIdx <
End) {
18841 std::distance(RangeSizes.begin(),
18842find_if(RangeSizes.drop_front(StartIdx),
18843 std::bind(IsVectorized,
Size >= MaxRegVF,
18844 std::placeholders::_1)));
18845unsigned Sz = EndIdx >=
End ?
End : EndIdx;
18846for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
18848Size >= MaxRegVF)) {
18855return cast<StoreInst>(V)
18856 ->getValueOperand()
18858 cast<StoreInst>(Slice.
front())
18859 ->getValueOperand()
18862"Expected all operands of same type.");
18863if (!NonSchedulable.empty()) {
18864auto [NonSchedSizeMax, NonSchedSizeMin] =
18865 NonSchedulable.lookup(Slice.
front());
18866if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
18867 Cnt += NonSchedSizeMax;
18872 std::optional<bool> Res =
18873 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18877 .first->getSecond()
18880// Mark the vectorized stores so that we don't vectorize them 18883// Mark the vectorized stores so that we don't vectorize them 18885 AnyProfitableGraph = RepeatChanged = Changed =
true;
18886// If we vectorized initial block, no need to try to vectorize 18889 [](std::pair<unsigned, unsigned> &
P) {
18890 P.first = P.second = 0;
18892if (Cnt < StartIdx + MinVF) {
18893for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18894 [](std::pair<unsigned, unsigned> &
P) {
18895 P.first = P.second = 0;
18897 StartIdx = Cnt +
Size;
18899if (Cnt > Sz -
Size - MinVF) {
18901 [](std::pair<unsigned, unsigned> &
P) {
18902 P.first = P.second = 0;
18911if (
Size > 2 && Res &&
18913 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
18914 std::placeholders::_1))) {
18918// Check for the very big VFs that we're not rebuilding same 18919// trees, just with larger number of elements. 18920if (
Size > MaxRegVF && TreeSize > 1 &&
18922 std::bind(FirstSizeSame, TreeSize,
18923 std::placeholders::_1))) {
18925while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18931 [&](std::pair<unsigned, unsigned> &
P) {
18932 if (Size >= MaxRegVF)
18933 P.second = std::max(P.second, TreeSize);
18935 P.first = std::max(P.first, TreeSize);
18938 AnyProfitableGraph =
true;
18942if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18943 AnyProfitableGraph =
true;
18944 StartIdx = std::distance(
18945 RangeSizes.begin(),
18946find_if(RangeSizes.drop_front(Sz),
18947 std::bind(IsNotVectorized,
Size >= MaxRegVF,
18948 std::placeholders::_1)));
18953// All values vectorized - exit. 18954if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
18955returnP.first == 0 &&
P.second == 0;
18958// Check if tried all attempts or no need for the last attempts at all. 18959if (Repeat >= MaxAttempts ||
18960 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18962constexprunsigned StoresLimit = 64;
18963constunsigned MaxTotalNum = std::min<unsigned>(
18965static_cast<unsigned>(
18968 RangeSizes.begin(),
18969find_if(RangeSizes, std::bind(IsNotVectorized,
true,
18970 std::placeholders::_1))) +
18972unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
18975 CandidateVFs.clear();
18977 CandidateVFs.push_back(Limit);
18978if (VF > MaxTotalNum || VF >= StoresLimit)
18980for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
18982P.first = std::max(
P.second,
P.first);
18984// Last attempt to vectorize max number of elements, if all previous 18985// attempts were unsuccessful because of the cost issues. 18986 CandidateVFs.push_back(VF);
18991// Stores pair (first: index of the store into Stores array ref, address of 18992// which taken as base, second: sorted set of pairs {index, dist}, which are 18993// indices of stores in the set and their store location distances relative to 18994// the base address). 18996// Need to store the index of the very first store separately, since the set 18997// may be reordered after the insertion and the first store may be moved. This 18998// container allows to reduce number of calls of getPointersDiff() function. 19000// Inserts the specified store SI with the given index Idx to the set of the 19001// stores. If the store with the same distance is found already - stop 19002// insertion, try to vectorize already found stores. If some stores from this 19003// sequence were not vectorized - try to vectorize them with the new store 19004// later. But this logic is applied only to the stores, that come before the 19005// previous store with the same distance. 19012// - Scan this from the last to first store. The very first bunch of stores is 19013// {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores 19015// - The next store in the list - #1 - has the same distance from store #5 as 19017// - Try to vectorize sequence of stores 4,2,3,5. 19018// - If all these stores are vectorized - just drop them. 19019// - If some of them are not vectorized (say, #3 and #5), do extra analysis. 19020// - Start new stores sequence. 19021// The new bunch of stores is {1, {1, 0}}. 19022// - Add the stores from previous sequence, that were not vectorized. 19023// Here we consider the stores in the reversed order, rather they are used in 19024// the IR (Stores are reversed already, see vectorizeStoreChains() function). 19025// Store #3 can be added -> comes after store #4 with the same distance as 19027// Store #5 cannot be added - comes before store #4. 19028// This logic allows to improve the compile time, we assume that the stores 19029// after previous store with the same distance most likely have memory 19030// dependencies and no need to waste compile time to try to vectorize them. 19031// - Try to vectorize the sequence {1, {1, 0}, {3, 2}}. 19033for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19035 Stores[
Set.first]->getValueOperand()->getType(),
19036 Stores[
Set.first]->getPointerOperand(),
19037SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
19038/*StrictCheck=*/true);
19041auto It =
Set.second.find(std::make_pair(
Idx, *Diff));
19042if (It ==
Set.second.end()) {
19043Set.second.emplace(
Idx, *Diff);
19046// Try to vectorize the first found set to avoid duplicate analysis. 19047 TryToVectorize(
Set.second);
19048unsigned ItIdx = It->first;
19049int ItDist = It->second;
19050 StoreIndexToDistSet PrevSet;
19051copy_if(
Set.second, std::inserter(PrevSet, PrevSet.end()),
19052 [&](
const std::pair<unsigned, int> &Pair) {
19053 return Pair.first > ItIdx;
19057Set.second.emplace(
Idx, 0);
19058// Insert stores that followed previous match to try to vectorize them 19060unsigned StartIdx = ItIdx + 1;
19062// Distances to previously found dup store (or this store, since they 19063// store to the same addresses). 19065for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
19066// Do not try to vectorize sequences, we already tried. 19067if (VectorizedStores.
contains(Stores[Pair.first]))
19069unsigned BI = Pair.first - StartIdx;
19070 UsedStores.set(BI);
19071 Dists[BI] = Pair.second - ItDist;
19073for (
unsignedI = StartIdx;
I <
Idx; ++
I) {
19074unsigned BI =
I - StartIdx;
19075if (UsedStores.test(BI))
19076Set.second.emplace(
I, Dists[BI]);
19080auto &Res = SortedStores.emplace_back();
19082 Res.second.emplace(
Idx, 0);
19084Type *PrevValTy =
nullptr;
19086if (
R.isDeleted(SI))
19089 PrevValTy =
SI->getValueOperand()->getType();
19090// Check that we do not try to vectorize stores of different types. 19091if (PrevValTy !=
SI->getValueOperand()->getType()) {
19092for (
auto &Set : SortedStores)
19093 TryToVectorize(
Set.second);
19094 SortedStores.clear();
19095 PrevValTy =
SI->getValueOperand()->getType();
19097 FillStoresSet(
I, SI);
19100// Final vectorization attempt. 19101for (
auto &Set : SortedStores)
19102 TryToVectorize(
Set.second);
void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
  // Initialize the collections. We will make a single pass over the block.
  Stores.clear();
  GEPs.clear();
  // Visit the store and getelementptr instructions in BB and organize them in
  // Stores and GEPs according to the underlying objects of their pointer
  // operands.
  for (Instruction &I : *BB) {
    // Ignore store instructions that are volatile or have a pointer operand
    // that doesn't point to a scalar type.
    if (auto *SI = dyn_cast<StoreInst>(&I)) {
      if (!SI->isSimple())
        continue;
      if (!isValidElementType(SI->getValueOperand()->getType()))
        continue;
      Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
    }
    // Ignore getelementptr instructions that have more than one index, a
    // constant index, or a pointer operand that doesn't point to a scalar
    // type.
    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
      if (GEP->getNumIndices() != 1)
        continue;
      Value *Idx = GEP->idx_begin()->get();
      if (isa<Constant>(Idx))
        continue;
      if (!isValidElementType(Idx->getType()))
        continue;
      if (GEP->getType()->isVectorTy())
        continue;
      GEPs[GEP->getPointerOperand()].push_back(GEP);
    }
  }
}
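// Illustrative seeds (not from the source): in
//   store i32 %a, ptr %p
//   %g = getelementptr inbounds i32, ptr %q, i64 %i
// the store is bucketed in Stores under the underlying object of %p, and %g
// qualifies as a GEP seed because it has a single, non-constant index, while
// `getelementptr i32, ptr %q, i64 4` would be skipped because its index is a
// constant.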
19149LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = " 19150 << VL.
size() <<
".\n");
19152// Check that all of the parts are instructions of the same type, 19153// we permit an alternate opcode via InstructionsState. 19159// Make sure invalid types (including vector type) are rejected before 19160// determining vectorization factor for scalar instructions. 19161for (
Value *V : VL) {
19162Type *Ty =
V->getType();
19164// NOTE: the following will give user internal llvm type name, which may 19166R.getORE()->emit([&]() {
19167 std::string TypeStr;
19171 <<
"Cannot SLP vectorize list: type " 19172 << TypeStr +
" is unsupported by vectorizer";
19179unsigned Sz =
R.getVectorElementSize(I0);
19180unsigned MinVF =
R.getMinVF(Sz);
19181unsigned MaxVF = std::max<unsigned>(
19183 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19185R.getORE()->emit([&]() {
19187 <<
"Cannot SLP vectorize list: vectorization factor " 19188 <<
"less than 2 is not supported";
19193bool Changed =
false;
19194bool CandidateFound =
false;
19197unsigned NextInst = 0, MaxInst = VL.size();
19198for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19200// No actual vectorization should happen, if number of parts is the same as 19201// provided vectorization factor (i.e. the scalar type is used for vector 19202// code during codegen). 19206for (
unsignedI = NextInst;
I < MaxInst; ++
I) {
19207unsigned ActualVF = std::min(MaxInst -
I, VF);
19212if (MaxVFOnly && ActualVF < MaxVF)
19214if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19219for (
Value *V : VL.drop_front(
I)) {
19220// Check that a previous iteration of this loop did not delete the 19222if (
auto *Inst = dyn_cast<Instruction>(V);
19223 !Inst || !
R.isDeleted(Inst)) {
19226if (
Idx == ActualVF)
19230// Not enough vectorizable instructions - exit. 19231if (
Idx != ActualVF)
19234LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations " 19238if (
R.isTreeTinyAndNotFullyVectorizable())
19240R.reorderTopToBottom();
19241R.reorderBottomToTop(
19242/*IgnoreReorder=*/!isa<InsertElementInst>(Ops.
front()) &&
19243 !
R.doesRootHaveInTreeUses());
19245R.buildExternalUses();
19247R.computeMinimumValueSizes();
19249 CandidateFound =
true;
19250 MinCost = std::min(MinCost,
Cost);
19253 <<
" for VF=" << ActualVF <<
"\n");
19257 cast<Instruction>(Ops[0]))
19258 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
19259 <<
" and with tree size " 19260 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
19263// Move to the next bundle. 19271if (!Changed && CandidateFound) {
19272R.getORE()->emit([&]() {
19274 <<
"List vectorization was possible but not beneficial with cost " 19275 <<
ore::NV(
"Cost", MinCost) <<
" >= " 19278 }
elseif (!Changed) {
19279R.getORE()->emit([&]() {
19281 <<
"Cannot SLP vectorize list: vectorization was impossible" 19282 <<
" with available vectorization factors";
bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
  if (!I)
    return false;

  if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
    return false;

  Value *P = I->getParent();

  // Vectorize in current basic block only.
  auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
  auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
      R.isDeleted(Op0) || R.isDeleted(Op1))
    return false;

  // First collect all possible candidates
  SmallVector<std::pair<Value *, Value *>, 4> Candidates;
  Candidates.emplace_back(Op0, Op1);

  auto *A = dyn_cast<BinaryOperator>(Op0);
  auto *B = dyn_cast<BinaryOperator>(Op1);
  // Try to skip B.
  if (A && B && B->hasOneUse()) {
    auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
    auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
      Candidates.emplace_back(A, B0);
    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
      Candidates.emplace_back(A, B1);
  }
  // Try to skip A.
  if (B && A && A->hasOneUse()) {
    auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
    auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
      Candidates.emplace_back(A0, B);
    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
      Candidates.emplace_back(A1, B);
  }

  if (Candidates.size() == 1)
    return tryToVectorizeList({Op0, Op1}, R);

  // We have multiple options. Try to pick the single best.
  std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
  if (!BestCandidate)
    return false;
  return tryToVectorizeList(
      {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
      R);
}
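// Illustrative example (not from the source): for
//   %s = add i32 %a, %b
//   %t = add i32 %c, %d
//   %r = add i32 %s, %t
// the candidate root pairs are {%s, %t} and, because %t has one use, the
// "skip one side" pairs {%s, %c} and {%s, %d} (when %c/%d are binary ops in
// the same block); findBestRootPair picks the best-scoring pair before calling
// tryToVectorizeList.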
19342/// Model horizontal reductions. 19344/// A horizontal reduction is a tree of reduction instructions that has values 19345/// that can be put into a vector as its leaves. For example: 19352/// This tree has "mul" as its leaf values and "+" as its reduction 19353/// instructions. A reduction can feed into a store or a binary operation 19371 ReductionOpsListType ReductionOps;
19372 /// List of possibly reduced values. 19374 /// Maps reduced value to the corresponding reduction operation. 19377 /// The type of reduction operation. 19379 /// Checks if the optimization of original scalar identity operations on 19380 /// matched horizontal reductions is enabled and allowed. 19381bool IsSupportedHorRdxIdentityOp =
false;
19388// And/or are potentially poison-safe logical patterns like: 19389// select x, y, false 19390// select x, true, y 19392return isa<SelectInst>(
I) &&
19396 /// Checks if instruction is associative and can be vectorized. 19398if (Kind == RecurKind::None)
19401// Integer ops that map to select instructions or intrinsics are fine. 19406if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19407// FP min/max are associative except for NaN and -0.0. We do not 19408// have to rule out -0.0 here because the intrinsic semantics do not 19409// specify a fixed result for it. 19410returnI->getFastMathFlags().noNaNs();
19413if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19416returnI->isAssociative();
19420// Poison-safe 'or' takes the form: select X, true, Y 19421// To make that work with the normal operand processing, we skip the 19422// true value operand. 19423// TODO: Change the code and data structures to handle this without a hack. 19425returnI->getOperand(2);
19426returnI->getOperand(
Index);
19429 /// Creates reduction operation with the current opcode. 19433case RecurKind::Or: {
19441case RecurKind::And: {
19449case RecurKind::Add:
19450case RecurKind::Mul:
19451case RecurKind::Xor:
19452case RecurKind::FAdd:
19453case RecurKind::FMul: {
19458case RecurKind::SMax:
19459case RecurKind::SMin:
19460case RecurKind::UMax:
19461case RecurKind::UMin:
19468case RecurKind::FMax:
19469case RecurKind::FMin:
19470case RecurKind::FMaximum:
19471case RecurKind::FMinimum: {
19480 /// Creates reduction operation with the current opcode with the IR flags 19481 /// from \p ReductionOps, dropping nuw/nsw flags. 19484const ReductionOpsListType &ReductionOps) {
19485bool UseSelect = ReductionOps.size() == 2 ||
19487 (ReductionOps.size() == 1 &&
19488any_of(ReductionOps.front(), IsaPred<SelectInst>));
19489assert((!UseSelect || ReductionOps.size() != 2 ||
19490 isa<SelectInst>(ReductionOps[1][0])) &&
19491"Expected cmp + select pairs for reduction");
19494if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
19496/*IncludeWrapFlags=*/false);
19498/*IncludeWrapFlags=*/false);
19508auto *
I = dyn_cast<Instruction>(V);
19510return RecurKind::None;
19512return RecurKind::Add;
19514return RecurKind::Mul;
19517return RecurKind::And;
19520return RecurKind::Or;
19522return RecurKind::Xor;
19524return RecurKind::FAdd;
19526return RecurKind::FMul;
19529return RecurKind::FMax;
19531return RecurKind::FMin;
19534return RecurKind::FMaximum;
19536return RecurKind::FMinimum;
19537// This matches either cmp+select or intrinsics. SLP is expected to handle 19539// TODO: If we are canonicalizing to intrinsics, we can remove several 19540// special-case paths that deal with selects. 19542return RecurKind::SMax;
19544return RecurKind::SMin;
19546return RecurKind::UMax;
19548return RecurKind::UMin;
19550if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
19551// Try harder: look for min/max pattern based on instructions producing 19552// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2). 19553// During the intermediate stages of SLP, it's very common to have 19554// pattern like this (since optimizeGatherSequence is run only once 19556// %1 = extractelement <2 x i32> %a, i32 0 19557// %2 = extractelement <2 x i32> %a, i32 1 19558// %cond = icmp sgt i32 %1, %2 19559// %3 = extractelement <2 x i32> %a, i32 0 19560// %4 = extractelement <2 x i32> %a, i32 1 19561// %select = select i1 %cond, i32 %3, i32 %4 19570// TODO: Support inverse predicates. 19572if (!isa<ExtractElementInst>(
RHS) ||
19574return RecurKind::None;
19576if (!isa<ExtractElementInst>(
LHS) ||
19578return RecurKind::None;
19580if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
19581return RecurKind::None;
19585return RecurKind::None;
19590return RecurKind::None;
19593return RecurKind::SMax;
19596return RecurKind::SMin;
19599return RecurKind::UMax;
19602return RecurKind::UMin;
19605return RecurKind::None;
  /// Get the index of the first operand.
  static unsigned getFirstOperandIndex(Instruction *I) {
    return isCmpSelMinMax(I) ? 1 : 0;
  }

  /// Total number of operands in the reduction operation.
  static unsigned getNumberOfOperands(Instruction *I) {
    return isCmpSelMinMax(I) ? 3 : 2;
  }

  /// Checks if the instruction is in basic block \p BB.
  /// For a cmp+sel min/max reduction check that both ops are in \p BB.
  static bool hasSameParent(Instruction *I, BasicBlock *BB) {
    if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
      auto *Sel = cast<SelectInst>(I);
      auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
      return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
    }
    return I->getParent() == BB;
  }

  /// Expected number of uses for reduction operations/reduced values.
  static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
    if (IsCmpSelMinMax) {
      // SelectInst must be used twice while the condition op must have single
      // use only.
      if (auto *Sel = dyn_cast<SelectInst>(I))
        return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
      return I->hasNUses(2);
    }

    // Arithmetic reduction operation must be used once only.
    return I->hasOneUse();
  }

  /// Initializes the list of reduction operations.
  void initReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I))
      ReductionOps.assign(2, ReductionOpsType());
    else
      ReductionOps.assign(1, ReductionOpsType());
  }
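  // Illustrative use counts (not from the source): in one step of a cmp+select
  // smin reduction
  //   %c = icmp slt i32 %x, %acc
  //   %m = select i1 %c, i32 %x, i32 %acc
  // %m feeds both the compare and the select of the next step (two uses),
  // while %c is used only by its select, which is exactly what
  // hasRequiredNumberOfUses checks when IsCmpSelMinMax is true.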
  /// Add all reduction operations for the reduction instruction \p I.
  void addReductionOps(Instruction *I) {
    if (isCmpSelMinMax(I)) {
      ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
      ReductionOps[1].emplace_back(I);
    } else {
      ReductionOps[0].emplace_back(I);
    }
  }
19663int Sz = Data.size();
19664auto *
I = dyn_cast<Instruction>(Data.front());
19672 /// Try to find a reduction tree. 19676 RdxKind = HorizontalReduction::getRdxKind(Root);
19677if (!isVectorizable(RdxKind, Root))
19680// Analyze "regular" integer/FP types for reductions - no target-specific 19681// types or pointers. 19686// Though the ultimate reduction may have multiple uses, its condition must 19687// have only single use. 19688if (
auto *Sel = dyn_cast<SelectInst>(Root))
19689if (!Sel->getCondition()->hasOneUse())
19692 ReductionRoot = Root;
19694// Iterate through all the operands of the possible reduction tree and 19695// gather all the reduced values, sorting them by their value id. 19697bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19699 1, std::make_pair(Root, 0));
19700// Checks if the operands of the \p TreeN instruction are also reduction 19701// operations or should be treated as reduced values or an extra argument, 19702// which is not part of the reduction. 19707for (
intI :
reverse(seq<int>(getFirstOperandIndex(TreeN),
19708 getNumberOfOperands(TreeN)))) {
19709Value *EdgeVal = getRdxOperand(TreeN,
I);
19710 ReducedValsToOps[EdgeVal].push_back(TreeN);
19711auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19712// If the edge is not an instruction, or it is different from the main 19713// reduction opcode or has too many uses - possible reduced value. 19714// Also, do not try to reduce const values, if the operation is not 19718 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19719 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19720 !isVectorizable(RdxKind, EdgeInst) ||
19721 (
R.isAnalyzedReductionRoot(EdgeInst) &&
19722all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19723 PossibleReducedVals.push_back(EdgeVal);
19726 ReductionOps.push_back(EdgeInst);
19729// Try to regroup reduced values so that it gets more profitable to try to 19730// reduce them. Values are grouped by their value ids, instructions - by 19731// instruction op id and/or alternate op id, plus do extra analysis for 19732// loads (grouping them by the distabce between pointers) and cmp 19733// instructions (grouping them by the predicate). 19737 PossibleReducedVals;
19738 initReductionOps(Root);
19742auto GenerateLoadsSubkey = [&](
size_tKey,
LoadInst *LI) {
19746if (!LoadKeyUsed.
insert(Key).second) {
19747auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
19748if (LIt != LoadsMap.
end()) {
19749for (
LoadInst *RLI : LIt->second) {
19752/*StrictCheck=*/true))
19755for (
LoadInst *RLI : LIt->second) {
19762if (LIt->second.size() > 2) {
19764hash_value(LIt->second.back()->getPointerOperand());
19770 .first->second.push_back(LI);
19774while (!Worklist.empty()) {
19775auto [TreeN, Level] = Worklist.pop_back_val();
19778 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19779 addReductionOps(TreeN);
19780// Add reduction values. The values are sorted for better vectorization 19782for (
Value *V : PossibleRedVals) {
19785/*AllowAlternate=*/false);
19786 ++PossibleReducedVals[
Key][
Idx]
19787 .
insert(std::make_pair(V, 0))
19791 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
19793auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
19794// Sort values by the total number of values kinds to start the reduction 19795// from the longest possible reduced values sequences. 19796for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
19797auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
19799for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
19802auto RedValsVect = It->second.takeVector();
19804for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
19805 PossibleRedValsVect.
back().append(Data.second, Data.first);
19807stable_sort(PossibleRedValsVect, [](
constauto &P1,
constauto &P2) {
19808returnP1.size() > P2.size();
19813 (!isGoodForReduction(Data) &&
19814 (!isa<LoadInst>(Data.front()) ||
19815 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19817 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19819 cast<LoadInst>(ReducedVals[NewIdx].front())
19821 NewIdx = ReducedVals.
size();
19824 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
19827// Sort the reduced values by number of same/alternate opcode and/or pointer 19830returnP1.size() > P2.
size();
19835 /// Attempt to vectorize the tree found by matchAssociativeReduction. 19839constexprunsigned RegMaxNumber = 4;
19840constexprunsigned RedValsMaxNumber = 128;
19841// If there are a sufficient number of reduction values, reduce 19842// to a nearby power-of-2. We can safely generate oversized 19843// vectors and rely on the backend to split them to legal sizes. 19844if (
unsigned NumReducedVals = std::accumulate(
19845 ReducedVals.
begin(), ReducedVals.
end(), 0,
19847 if (!isGoodForReduction(Vals))
19849 return Num + Vals.size();
19851 NumReducedVals < ReductionLimit &&
19855for (ReductionOpsType &RdxOps : ReductionOps)
19856for (
Value *RdxOp : RdxOps)
19857V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19865// Track the reduced values in case if they are replaced by extractelement 19866// because of the vectorization. 19868 ReducedVals.
front().size());
19870// The compare instruction of a min/max is the insertion point for new 19871// instructions and may be replaced with a new compare instruction. 19872auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19873assert(isa<SelectInst>(RdxRootInst) &&
19874"Expected min/max reduction to have select root instruction");
19875Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19876assert(isa<Instruction>(ScalarCond) &&
19877"Expected min/max reduction to have compare condition");
19878return cast<Instruction>(ScalarCond);
19881bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19882 return isBoolLogicOp(cast<Instruction>(V));
19884// Return new VectorizedTree, based on previous value. 19885auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19886if (VectorizedTree) {
19887// Update the final value in the reduction. 19889 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19890if (AnyBoolLogicOp) {
19891auto It = ReducedValsToOps.
find(VectorizedTree);
19892auto It1 = ReducedValsToOps.
find(Res);
19893if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19895 (It != ReducedValsToOps.
end() &&
19897 return isBoolLogicOp(I) &&
19898 getRdxOperand(I, 0) == VectorizedTree;
19902 (It1 != ReducedValsToOps.
end() &&
19904 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19908 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19912return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19915// Initialize the final value in the reduction. 19919 ReductionOps.front().size());
19920for (ReductionOpsType &RdxOps : ReductionOps)
19921for (
Value *RdxOp : RdxOps) {
19924 IgnoreList.insert(RdxOp);
19926// Intersect the fast-math-flags from all reduction operations. 19929for (
Value *U : IgnoreList)
19930if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19931 RdxFMF &= FPMO->getFastMathFlags();
19932bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19934// Need to track reduced vals, they may be changed during vectorization of 19937for (
Value *V : Candidates)
19938 TrackedVals.try_emplace(V, V);
19942auto *It = MV.
find(V);
19943assert(It != MV.
end() &&
"Unable to find given key.");
19948// List of the values that were reduced in other trees as part of gather 19949// nodes and thus requiring extract if fully vectorized in other trees. 19952bool CheckForReusedReductionOps =
false;
19953// Try to vectorize elements based on their type. 19957for (
unsignedI = 0,
E = ReducedVals.
size();
I <
E; ++
I) {
19959 InstructionsState S = States[
I];
19963for (
unsigned Cnt = 0, Sz = OrigReducedVals.
size(); Cnt < Sz; ++Cnt) {
19964Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19965// Check if the reduction value was not overriden by the extractelement 19966// instruction because of the vectorization and exclude it, if it is not 19967// compatible with other values. 19968// Also check if the instruction was folded to constant/other value. 19969auto *Inst = dyn_cast<Instruction>(RdxVal);
19971 (!S || !S.isOpcodeOrAlt(Inst))) ||
19975 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
19977bool ShuffledExtracts =
false;
19978// Try to handle shuffled extractelements. 19979if (S && S.getOpcode() == Instruction::ExtractElement &&
19980 !S.isAltShuffle() &&
I + 1 <
E) {
19982for (
Value *RV : ReducedVals[
I + 1]) {
19983Value *RdxVal = TrackedVals.at(RV);
19984// Check if the reduction value was not overriden by the 19985// extractelement instruction because of the vectorization and 19986// exclude it, if it is not compatible with other values. 19987auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
19990 CommonCandidates.push_back(RdxVal);
19991 TrackedToOrig.try_emplace(RdxVal, RV);
19996 Candidates.
swap(CommonCandidates);
19997 ShuffledExtracts =
true;
20001// Emit code for constant values. 20004Value *OrigV = TrackedToOrig.at(Candidates.
front());
20005 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20007 Res = createOp(Builder, RdxKind, Res, VC,
"const.rdx", ReductionOps);
20008Value *OrigV = TrackedToOrig.at(VC);
20009 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20010if (
auto *ResI = dyn_cast<Instruction>(Res))
20011V.analyzedReductionRoot(ResI);
20013 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20017unsigned NumReducedVals = Candidates.
size();
20018if (NumReducedVals < ReductionLimit &&
20019 (NumReducedVals < 2 || !
isSplat(Candidates)))
20022// Check if we support repeated scalar values processing (optimization of 20023// original scalar identity operations on matched horizontal reductions). 20024 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20025 RdxKind != RecurKind::FMul &&
20026 RdxKind != RecurKind::FMulAdd;
20027// Gather same values. 20029if (IsSupportedHorRdxIdentityOp)
20030for (
Value *V : Candidates) {
20031Value *OrigV = TrackedToOrig.at(V);
20032 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20034// Used to check if the reduced values used same number of times. In this 20035// case the compiler may produce better code. E.g. if reduced values are 20036// aabbccdd (8 x values), then the first node of the tree will have a node 20037// for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>. 20038// Plus, the final reduction will be performed on <8 x aabbccdd>. 20039// Instead compiler may build <4 x abcd> tree immediately, + reduction (4 20041// Currently it only handles add/fadd/xor. and/or/min/max do not require 20042// this analysis, other operations may require an extra estimation of 20043// the profitability. 20044bool SameScaleFactor =
false;
20045bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20046 SameValuesCounter.
size() != Candidates.size();
20048if (OptReusedScalars) {
20050 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20051 RdxKind == RecurKind::Xor) &&
20053 [&SameValuesCounter](
const std::pair<Value *, unsigned> &
P) {
20054returnP.second == SameValuesCounter.
front().second;
20056 Candidates.resize(SameValuesCounter.
size());
20057transform(SameValuesCounter, Candidates.begin(),
20058 [&](
constauto &
P) { return TrackedVals.at(P.first); });
20059 NumReducedVals = Candidates.size();
20060// Have a reduction of the same element. 20061if (NumReducedVals == 1) {
20062Value *OrigV = TrackedToOrig.at(Candidates.front());
20063unsigned Cnt = At(SameValuesCounter, OrigV);
20065 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20066 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20067 VectorizedVals.try_emplace(OrigV, Cnt);
20068 ExternallyUsedValues.
insert(OrigV);
20073unsigned MaxVecRegSize =
V.getMaxVecRegSize();
20074unsigned EltSize =
V.getVectorElementSize(Candidates[0]);
20075constunsigned MaxElts = std::clamp<unsigned>(
20077 RegMaxNumber * RedValsMaxNumber);
20079unsigned ReduxWidth = NumReducedVals;
20080auto GetVectorFactor = [&, &
TTI = *
TTI](
unsigned ReduxWidth) {
20081unsigned NumParts, NumRegs;
20082Type *ScalarTy = Candidates.front()->getType();
20089while (NumParts > NumRegs) {
20090assert(ReduxWidth > 0 &&
"ReduxWidth is unexpectedly 0.");
20097if (NumParts > NumRegs / 2)
20102 ReduxWidth = GetVectorFactor(ReduxWidth);
20103 ReduxWidth = std::min(ReduxWidth, MaxElts);
20106unsigned Pos = Start;
20107// Restarts vectorization attempt with lower vector factor. 20108unsigned PrevReduxWidth = ReduxWidth;
20109bool CheckForReusedReductionOpsLocal =
false;
20110auto AdjustReducedVals = [&](
bool IgnoreVL =
false) {
20111bool IsAnyRedOpGathered = !IgnoreVL &&
V.isAnyGathered(IgnoreList);
20112if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20113// Check if any of the reduction ops are gathered. If so, worth 20114// trying again with less number of reduction ops. 20115 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20118if (Pos < NumReducedVals - ReduxWidth + 1)
20119return IsAnyRedOpGathered;
20123 ReduxWidth = GetVectorFactor(ReduxWidth);
20124return IsAnyRedOpGathered;
20126bool AnyVectorized =
false;
20128while (Pos < NumReducedVals - ReduxWidth + 1 &&
20129 ReduxWidth >= ReductionLimit) {
20130// Dependency in tree of the reduction ops - drop this attempt, try 20132if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20134 CheckForReusedReductionOps =
true;
20137 PrevReduxWidth = ReduxWidth;
20139// Been analyzed already - skip. 20140if (IgnoredCandidates.
contains(std::make_pair(Pos, ReduxWidth)) ||
20143 std::make_pair(Pos,
bit_floor(ReduxWidth))) ||
20145 std::make_pair(Pos + (ReduxWidth -
bit_floor(ReduxWidth)),
20147V.areAnalyzedReductionVals(VL)) {
20148 (void)AdjustReducedVals(
/*IgnoreVL=*/true);
20151// Early exit if any of the reduction values were deleted during 20152// previous vectorization attempts. 20154auto *RedValI = dyn_cast<Instruction>(RedVal);
20157returnV.isDeleted(RedValI);
20160V.buildTree(VL, IgnoreList);
20161if (
V.isTreeTinyAndNotFullyVectorizable(
/*ForReduction=*/true)) {
20162if (!AdjustReducedVals())
20163V.analyzedReductionVals(VL);
20166if (
V.isLoadCombineReductionCandidate(RdxKind)) {
20167if (!AdjustReducedVals())
20168V.analyzedReductionVals(VL);
20171V.reorderTopToBottom();
20172// No need to reorder the root node at all. 20173V.reorderBottomToTop(
/*IgnoreReorder=*/true);
20174// Keep extracted other reduction values, if they are used in the 20175// vectorization trees. 20177 ExternallyUsedValues);
20178// The reduction root is used as the insertion point for new 20179// instructions, so set it as externally used to prevent it from being 20181 LocalExternallyUsedValues.insert(ReductionRoot);
20182for (
unsigned Cnt = 0, Sz = ReducedVals.
size(); Cnt < Sz; ++Cnt) {
20183if (Cnt ==
I || (ShuffledExtracts && Cnt ==
I - 1))
20185for (
Value *V : ReducedVals[Cnt])
20186if (isa<Instruction>(V))
20187 LocalExternallyUsedValues.insert(TrackedVals[V]);
20189if (!IsSupportedHorRdxIdentityOp) {
20190// Number of uses of the candidates in the vector of values. 20192"Reused values counter map is not empty");
20193for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20194if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20196Value *
V = Candidates[Cnt];
20197Value *OrigV = TrackedToOrig.at(V);
20198 ++SameValuesCounter.
try_emplace(OrigV).first->second;
20203// Gather externally used values. 20205for (
unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20206if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20208Value *RdxVal = Candidates[Cnt];
20209if (
auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20210 RdxVal = It->second;
20211if (!Visited.
insert(RdxVal).second)
20213// Check if the scalar was vectorized as part of the vectorization 20214// tree but not the top node. 20215if (!VLScalars.contains(RdxVal) &&
V.isVectorized(RdxVal)) {
20216 LocalExternallyUsedValues.insert(RdxVal);
20219Value *OrigV = TrackedToOrig.at(RdxVal);
20221 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20222if (NumOps != ReducedValsToOps.
at(OrigV).size())
20223 LocalExternallyUsedValues.insert(RdxVal);
20225// Do not need the list of reused scalars in regular mode anymore. 20226if (!IsSupportedHorRdxIdentityOp)
20227 SameValuesCounter.
clear();
20228for (
Value *RdxVal : VL)
20229if (RequiredExtract.
contains(RdxVal))
20230 LocalExternallyUsedValues.insert(RdxVal);
20231V.buildExternalUses(LocalExternallyUsedValues);
20233V.computeMinimumValueSizes();
20238 getReductionCost(
TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20241 <<
" for reduction\n");
20245V.getORE()->emit([&]() {
20247 ReducedValsToOps.
at(VL[0]).front())
20248 <<
"Vectorizing horizontal reduction is possible " 20249 <<
"but not beneficial with cost " <<
ore::NV(
"Cost",
Cost)
20250 <<
" and threshold " 20253if (!AdjustReducedVals()) {
20254V.analyzedReductionVals(VL);
20255unsignedOffset = Pos == Start ? Pos : Pos - 1;
20256if (ReduxWidth > ReductionLimit &&
V.isTreeNotExtendable()) {
20257// Add subvectors of VL to the list of the analyzed values. 20259 *
TTI, VL.front()->getType(), ReduxWidth - 1);
20260 VF >= ReductionLimit;
20262 *
TTI, VL.front()->getType(), VF - 1)) {
20264V.getCanonicalGraphSize() !=
V.getTreeSize())
20266for (
unsignedIdx : seq<unsigned>(ReduxWidth - VF))
20274LLVM_DEBUG(
dbgs() <<
"SLP: Vectorizing horizontal reduction at cost:" 20275 <<
Cost <<
". (HorRdx)\n");
20276V.getORE()->emit([&]() {
20278 ReducedValsToOps.
at(VL[0]).front())
20279 <<
"Vectorized horizontal reduction with cost " 20281 <<
ore::NV(
"TreeSize",
V.getTreeSize());
20286// Emit a reduction. If the root is a select (min/max idiom), the insert 20287// point is the compare condition of that select. 20288Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20291 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20293// Vectorize a tree. 20294Value *VectorizedRoot =
20295V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20296// Update TrackedToOrig mapping, since the tracked values might be 20298for (
Value *RdxVal : Candidates) {
20299Value *OrigVal = TrackedToOrig.at(RdxVal);
20300Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20301if (TransformedRdxVal != RdxVal)
20302 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20307// To prevent poison from leaking across what used to be sequential, 20308// safe, scalar boolean logic operations, the reduction operand must be 20311 VectorizedRoot = Builder.
CreateFreeze(VectorizedRoot);
20313// Emit code to correctly handle reused reduced values, if required. 20314if (OptReusedScalars && !SameScaleFactor) {
20315 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20316 SameValuesCounter, TrackedToOrig);
20319Value *ReducedSubTree;
20320Type *ScalarTy = VL.front()->getType();
20321if (isa<FixedVectorType>(ScalarTy)) {
20326for (
unsignedI : seq<unsigned>(ScalarTyNumElements)) {
20327// Do reduction for each lane. 20328// e.g., do reduce add for 20329// VL[0] = <4 x Ty> <a, b, c, d> 20330// VL[1] = <4 x Ty> <e, f, g, h> 20331// Lane[0] = <2 x Ty> <a, e> 20332// Lane[1] = <2 x Ty> <b, f> 20333// Lane[2] = <2 x Ty> <c, g> 20334// Lane[3] = <2 x Ty> <d, h> 20335// result[0] = reduce add Lane[0] 20336// result[1] = reduce add Lane[1] 20337// result[2] = reduce add Lane[2] 20338// result[3] = reduce add Lane[3] 20344 emitReduction(Lane, Builder,
TTI, RdxRootInst->
getType()),
I);
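// Illustrative sketch only (hypothetical, not part of the pass): the per-lane
// reduction described in the comment above, written out with plain arrays.
// Each lane gathers element I of every reduced value and is reduced on its
// own, so the result is itself a small vector. Assumes <array>.
[[maybe_unused]] const auto PerLaneReduceSketch = [] {
  const std::array<std::array<int, 4>, 2> VLVals = {{{1, 2, 3, 4},   // <a, b, c, d>
                                                     {5, 6, 7, 8}}}; // <e, f, g, h>
  std::array<int, 4> Result{};
  for (unsigned Lane = 0; Lane < 4; ++Lane)
    for (const auto &Val : VLVals)
      Result[Lane] += Val[Lane]; // reduce add over lane Lane of every value
  return Result;                 // {a+e, b+f, c+g, d+h}
};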
20347 ReducedSubTree = emitReduction(VectorizedRoot, Builder,
TTI,
20350if (ReducedSubTree->
getType() != VL.front()->getType()) {
20351assert(ReducedSubTree->
getType() != VL.front()->getType() &&
20352"Expected different reduction type.");
20354 Builder.
CreateIntCast(ReducedSubTree, VL.front()->getType(),
20355V.isSignedMinBitwidthRootNode());
20358// Improved analysis for add/fadd/xor reductions with same scale factor 20359// for all operands of reductions. We can emit scalar ops for them 20361if (OptReusedScalars && SameScaleFactor)
20362 ReducedSubTree = emitScaleForReusedOps(
20363 ReducedSubTree, Builder, SameValuesCounter.
front().second);
20365 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20366// Count vectorized reduced values to exclude them from final reduction. 20367for (
Value *RdxVal : VL) {
20368Value *OrigV = TrackedToOrig.at(RdxVal);
20369if (IsSupportedHorRdxIdentityOp) {
20370 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20373 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20374if (!
V.isVectorized(RdxVal))
20375 RequiredExtract.
insert(RdxVal);
20379 ReduxWidth = NumReducedVals - Pos;
20381 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20382 AnyVectorized =
true;
20384if (OptReusedScalars && !AnyVectorized) {
20385for (
const std::pair<Value *, unsigned> &
P : SameValuesCounter) {
20386Value *RdxVal = TrackedVals.at(
P.first);
20387Value *RedVal = emitScaleForReusedOps(RdxVal, Builder,
P.second);
20388 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20389 VectorizedVals.try_emplace(
P.first,
P.second);
20394if (VectorizedTree) {
20395// Reorder operands of bool logical op in the natural order to avoid 20396// possible problem with poison propagation. If not possible to reorder 20397// (both operands are originally RHS), emit an extra freeze instruction 20398// for the LHS operand. 20399// I.e., if we have original code like this: 20400// RedOp1 = select i1 ?, i1 LHS, i1 false 20401// RedOp2 = select i1 RHS, i1 ?, i1 false 20403// Then, we swap LHS/RHS to create a new op that matches the poison 20404// semantics of the original code. 20406// If we have original code like this and both values could be poison: 20407// RedOp1 = select i1 ?, i1 LHS, i1 false 20408// RedOp2 = select i1 ?, i1 RHS, i1 false 20410// Then, we must freeze LHS in the new op. 20415if (!AnyBoolLogicOp)
20417if (isBoolLogicOp(RedOp1) && ((!InitStep &&
LHS == VectorizedTree) ||
20418 getRdxOperand(RedOp1, 0) ==
LHS ||
20421if (isBoolLogicOp(RedOp2) && ((!InitStep &&
RHS == VectorizedTree) ||
20422 getRdxOperand(RedOp2, 0) ==
RHS ||
20427if (
LHS != VectorizedTree)
20430// Finish the reduction. 20431// Need to add extra arguments and not vectorized possible reduction 20433// Try to avoid dependencies between the scalar remainders after 20438unsigned Sz = InstVals.
size();
20441for (
unsignedI = 0,
E = (Sz / 2) * 2;
I <
E;
I += 2) {
20444Value *RdxVal1 = InstVals[
I].second;
20445Value *StableRdxVal1 = RdxVal1;
20446auto It1 = TrackedVals.find(RdxVal1);
20447if (It1 != TrackedVals.end())
20448 StableRdxVal1 = It1->second;
20449Value *RdxVal2 = InstVals[
I + 1].second;
20450Value *StableRdxVal2 = RdxVal2;
20451auto It2 = TrackedVals.find(RdxVal2);
20452if (It2 != TrackedVals.end())
20453 StableRdxVal2 = It2->second;
20454// To prevent poison from leaking across what used to be 20455// sequential, safe, scalar boolean logic operations, the 20456// reduction operand must be frozen. 20457 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[
I].first,
20459Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20460 StableRdxVal2,
"op.rdx", ReductionOps);
20461 ExtraReds[
I / 2] = std::make_pair(InstVals[
I].first, ExtraRed);
20464 ExtraReds[Sz / 2] = InstVals.
back();
20468 ExtraReductions.
emplace_back(cast<Instruction>(ReductionRoot),
20472for (
Value *RdxVal : Candidates) {
20473if (!Visited.
insert(RdxVal).second)
20475unsigned NumOps = VectorizedVals.lookup(RdxVal);
20481// Iterate through all not-vectorized reduction values/extra arguments. 20482bool InitStep =
true;
20483while (ExtraReductions.
size() > 1) {
20485 FinalGen(ExtraReductions, InitStep);
20486 ExtraReductions.
swap(NewReds);
20489 VectorizedTree = ExtraReductions.
front().second;
20491 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20493// The original scalar reduction is expected to have no remaining 20494// uses outside the reduction tree itself. Assert that we got this 20495// correct, replace internal uses with undef, and mark for eventual 20500 IgnoreSet.
insert(RdxOps.begin(), RdxOps.end());
20507for (
auto *U :
Ignore->users()) {
"All users must be in the reduction ops list.");
20512if (!
Ignore->use_empty()) {
20514Ignore->replaceAllUsesWith(
P);
20517V.removeInstructionsAndOperands(RdxOps);
20519 }
elseif (!CheckForReusedReductionOps) {
20520for (ReductionOpsType &RdxOps : ReductionOps)
20521for (
Value *RdxOp : RdxOps)
20522V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20524return VectorizedTree;
20528 /// Calculate the cost of a reduction. 20534Type *ScalarTy = ReducedVals.
front()->getType();
20535unsigned ReduxWidth = ReducedVals.
size();
20538// If all of the reduced values are constant, the vector cost is 0, since 20539// the reduction value can be calculated at the compile time. 20543// Scalar cost is repeated for N-1 elements. 20544int Cnt = ReducedVals.
size();
20545for (
Value *RdxVal : ReducedVals) {
20550Cost += GenCostFn();
20555auto *RdxOp = cast<Instruction>(U);
20556if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20566Cost += GenCostFn();
20571case RecurKind::Add:
20572case RecurKind::Mul:
20574case RecurKind::And:
20575case RecurKind::Xor:
20576case RecurKind::FAdd:
20577case RecurKind::FMul: {
20580if (
auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20583for (
unsignedI : seq<unsigned>(ReducedVals.size())) {
20595auto [RType, IsSigned] =
R.getRootNodeTypeWithNoCast().value_or(
20596 std::make_pair(RedTy,
true));
20597if (RType == RedTy) {
20607 ScalarCost = EvaluateScalarCost([&]() {
20612case RecurKind::FMax:
20613case RecurKind::FMin:
20614case RecurKind::FMaximum:
20615case RecurKind::FMinimum:
20616case RecurKind::SMax:
20617case RecurKind::SMin:
20618case RecurKind::UMax:
20619case RecurKind::UMin: {
20623 ScalarCost = EvaluateScalarCost([&]() {
20633LLVM_DEBUG(
dbgs() <<
"SLP: Adding cost " << VectorCost - ScalarCost
20635 <<
" (It is a splitting reduction)\n");
20636return VectorCost - ScalarCost;
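// Illustrative sketch only (hypothetical numbers, not the pass's cost model):
// reducing N scalars with one binary op per step takes N - 1 scalar ops, so
// the difference returned above is negative exactly when the vectorized tree
// is cheaper than those scalar operations; the pass additionally compares it
// against the slp-threshold option before committing.
[[maybe_unused]] const auto ReductionCostSketch =
    [](unsigned NumReducedVals, int ScalarOpCost, int VectorTreeCost) {
  const int ScalarCost = int(NumReducedVals - 1) * ScalarOpCost;
  return VectorTreeCost - ScalarCost; // negative means profitable
};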
20639 /// Emit a horizontal reduction of the vectorized value. 20642assert(VectorizedValue &&
"Need to have a vectorized tree node");
20643assert(RdxKind != RecurKind::FMulAdd &&
20644"A call to the llvm.fmuladd intrinsic is not handled yet");
20646auto *FTy = cast<FixedVectorType>(VectorizedValue->
getType());
20647if (FTy->getScalarType() == Builder.
getInt1Ty() &&
20648 RdxKind == RecurKind::Add &&
20650// Convert vector_reduce_add(ZExt(<n x i1>)) to 20651// ZExtOrTrunc(ctpop(bitcast <n x i1> to in)). 20653 VectorizedValue, Builder.
getIntNTy(FTy->getNumElements()));
20654 ++NumVectorInstructions;
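// Illustrative sketch only (hypothetical, not part of the pass): why the
// rewrite above is valid. Add-reducing an all-0/1 vector is the same as
// counting the set bits of its bitmask, i.e. ctpop(bitcast <n x i1> to in).
// Assumes <array>, <bitset> and assert are available.
[[maybe_unused]] const auto BoolSumIsPopcountSketch = [] {
  const std::array<bool, 8> Lanes = {true, false, true, true,
                                     false, false, true, false};
  unsigned Sum = 0, Mask = 0;
  for (unsigned I = 0; I < Lanes.size(); ++I) {
    Sum += Lanes[I];                 // vector_reduce_add(zext <8 x i1>)
    Mask |= unsigned(Lanes[I]) << I; // bitcast <8 x i1> to i8
  }
  assert(Sum == std::bitset<8>(Mask).count() && "reduce_add == ctpop");
  return Sum;
};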
20657 ++NumVectorInstructions;
20661 /// Emits optimized code for unique scalar value reused \p Cnt times. 20664assert(IsSupportedHorRdxIdentityOp &&
20665"The optimization of matched scalar identity horizontal reductions " 20666"must be supported.");
20668return VectorizedValue;
20670case RecurKind::Add: {
20672Value *Scale = ConstantInt::get(VectorizedValue->
getType(), Cnt);
20674 << VectorizedValue <<
". (HorRdx)\n");
20675return Builder.
CreateMul(VectorizedValue, Scale);
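// Illustrative sketch only (hypothetical, not part of the pass): the identity
// behind the scaling above. Add-reducing Cnt copies of the same value is just
// a multiply by Cnt, so one mul replaces Cnt - 1 scalar adds.
[[maybe_unused]] const auto RepeatedAddIsMulSketch = [](int Val, unsigned Cnt) {
  int Sum = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Sum += Val; // add-reduce Cnt identical operands
  assert(Sum == Val * int(Cnt) && "sum of Cnt copies == Val * Cnt");
  return Sum;
};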
20677case RecurKind::Xor: {
20678// res = n % 2 ? 0 : vv 20679LLVM_DEBUG(
dbgs() <<
"SLP: Xor " << Cnt <<
"of " << VectorizedValue
20683return VectorizedValue;
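// Illustrative sketch only (hypothetical, not part of the pass): the parity
// rule used above for reused Xor operands. Xor-ing the same value an even
// number of times cancels to 0, while an odd number of times leaves the
// value itself.
[[maybe_unused]] const auto RepeatedXorSketch = [](unsigned Val, unsigned Cnt) {
  unsigned Acc = 0;
  for (unsigned I = 0; I < Cnt; ++I)
    Acc ^= Val; // xor-reduce Cnt identical operands
  assert(Acc == ((Cnt % 2) ? Val : 0u) && "xor parity identity");
  return Acc;
};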
20685case RecurKind::FAdd: {
20687Value *Scale = ConstantFP::get(VectorizedValue->
getType(), Cnt);
20689 << VectorizedValue <<
". (HorRdx)\n");
20690return Builder.
CreateFMul(VectorizedValue, Scale);
20692case RecurKind::And:
20694case RecurKind::SMax:
20695case RecurKind::SMin:
20696case RecurKind::UMax:
20697case RecurKind::UMin:
20698case RecurKind::FMax:
20699case RecurKind::FMin:
20700case RecurKind::FMaximum:
20701case RecurKind::FMinimum:
20703return VectorizedValue;
20704case RecurKind::Mul:
20705case RecurKind::FMul:
20706case RecurKind::FMulAdd:
20707case RecurKind::IAnyOf:
20708case RecurKind::FAnyOf:
20709case RecurKind::IFindLastIV:
20710case RecurKind::FFindLastIV:
20711case RecurKind::None:
20717 /// Emits actual operation for the scalar identity values, found during 20718 /// horizontal reduction analysis. 20723assert(IsSupportedHorRdxIdentityOp &&
20724"The optimization of matched scalar identity horizontal reductions " 20725"must be supported.");
20727auto *VTy = cast<FixedVectorType>(VectorizedValue->
getType());
20728if (VTy->getElementType() != VL.
front()->getType()) {
20732R.isSignedMinBitwidthRootNode());
20735case RecurKind::Add: {
20736// root = mul prev_root, <1, 1, n, 1> 20738for (
Value *V : VL) {
20739unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20740 Vals.
push_back(ConstantInt::get(
V->getType(), Cnt,
/*IsSigned=*/false));
20744 << VectorizedValue <<
". (HorRdx)\n");
20745return Builder.
CreateMul(VectorizedValue, Scale);
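// Illustrative sketch only (hypothetical, not part of the pass): the per-lane
// scaling described by "root = mul prev_root, <1, 1, n, 1>". Each lane of the
// vectorized root is multiplied by the number of times its original scalar
// was repeated; lanes with count 1 stay unchanged. Assumes <array>.
[[maybe_unused]] const auto PerLaneScaleSketch =
    [](std::array<int, 4> Root, const std::array<unsigned, 4> &RepeatCounts) {
  for (unsigned I = 0; I < Root.size(); ++I)
    Root[I] *= int(RepeatCounts[I]); // scale lane I by its repeat count
  return Root;                       // fed into the final add-reduction
};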
20747case RecurKind::And:
20749// No need for multiple or/and(s). 20752return VectorizedValue;
20753case RecurKind::SMax:
20754case RecurKind::SMin:
20755case RecurKind::UMax:
20756case RecurKind::UMin:
20757case RecurKind::FMax:
20758case RecurKind::FMin:
20759case RecurKind::FMaximum:
20760case RecurKind::FMinimum:
20761// No need for multiple min/max(s) of the same value. 20764return VectorizedValue;
20765case RecurKind::Xor: {
// Replace values with an even number of repeats with 0, since
// root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
// 7>, if the 4th and 6th elements have an even number of repeats.
cast<FixedVectorType>(VectorizedValue->
getType())->getNumElements(),
20773 std::iota(
Mask.begin(),
Mask.end(), 0);
20774bool NeedShuffle =
false;
20775for (
unsignedI = 0, VF = VL.size();
I < VF; ++
I) {
20777unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20786dbgs() <<
"> of " << VectorizedValue <<
". (HorRdx)\n");
20790 ConstantVector::getNullValue(VectorizedValue->
getType()),
Mask);
20791return VectorizedValue;
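// Illustrative sketch only (hypothetical, not part of the pass): the effect
// of the shuffle above. Every lane whose scalar is repeated an even number of
// times xors away to 0, so the mask picks that lane from the zero vector and
// keeps the original lane otherwise. Assumes <array>.
[[maybe_unused]] const auto XorLaneMaskSketch =
    [](std::array<int, 4> Root, const std::array<unsigned, 4> &RepeatCounts) {
  for (unsigned I = 0; I < Root.size(); ++I)
    if (RepeatCounts[I] % 2 == 0)
      Root[I] = 0; // lane taken from zeroinitializer by the shuffle mask
  return Root;
};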
20793case RecurKind::FAdd: {
20794// root = fmul prev_root, <1.0, 1.0, n.0, 1.0> 20796for (
Value *V : VL) {
20797unsigned Cnt = SameValuesCounter.
lookup(TrackedToOrig.
at(V));
20798 Vals.
push_back(ConstantFP::get(
V->getType(), Cnt));
20801return Builder.
CreateFMul(VectorizedValue, Scale);
20803case RecurKind::Mul:
20804case RecurKind::FMul:
20805case RecurKind::FMulAdd:
20806case RecurKind::IAnyOf:
20807case RecurKind::FAnyOf:
20808case RecurKind::IFindLastIV:
20809case RecurKind::FFindLastIV:
20810case RecurKind::None:
20816}
// end anonymous namespace 20818/// Gets recurrence kind from the specified value. 20820return HorizontalReduction::getRdxKind(V);
20823if (
auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20824return cast<FixedVectorType>(IE->getType())->getNumElements();
20826unsigned AggregateSize = 1;
20827auto *
IV = cast<InsertValueInst>(InsertInst);
20828Type *CurrentType =
IV->getType();
20830if (
auto *ST = dyn_cast<StructType>(CurrentType)) {
20831for (
auto *Elt : ST->elements())
20832if (Elt != ST->getElementType(0))
// check homogeneity 20833return std::nullopt;
20834 AggregateSize *= ST->getNumElements();
20835 CurrentType = ST->getElementType(0);
20836 }
elseif (
auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20837 AggregateSize *= AT->getNumElements();
20838 CurrentType = AT->getElementType();
20839 }
elseif (
auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20840 AggregateSize *= VT->getNumElements();
20841return AggregateSize;
20843return AggregateSize;
20845return std::nullopt;
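// Illustrative sketch only (hypothetical helper, not part of the pass): the
// aggregate size computed above is just the product of the element counts at
// each homogeneous nesting level, e.g. [2 x {<2 x float>, <2 x float>}]
// flattens to 2 * 2 * 2 = 8 buildvector slots.
[[maybe_unused]] static const auto FlattenedAggregateSizeSketch =
    [](ArrayRef<unsigned> LevelCounts) {
  unsigned Size = 1;
  for (unsigned Count : LevelCounts)
    Size *= Count;
  return Size;
};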
20854unsigned OperandOffset,
constBoUpSLP &R) {
20857 std::optional<unsigned> OperandIndex =
20859if (!OperandIndex || R.isDeleted(LastInsertInst))
20861if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20863 BuildVectorOpds, InsertElts, *OperandIndex, R);
20866 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20867 InsertElts[*OperandIndex] = LastInsertInst;
20869 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->
getOperand(0));
20870 }
while (LastInsertInst !=
nullptr &&
20871 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20875/// Recognize construction of vectors like 20876/// %ra = insertelement <4 x float> poison, float %s0, i32 0 20877/// %rb = insertelement <4 x float> %ra, float %s1, i32 1 20878/// %rc = insertelement <4 x float> %rb, float %s2, i32 2 20879/// %rd = insertelement <4 x float> %rc, float %s3, i32 3 20880/// starting from the last insertelement or insertvalue instruction. 20882/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>}, 20883/// {{float, float}, {float, float}}, [2 x {float, float}] and so on. 20884/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples. 20886/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type. 20888/// \return true if it matches. 20895assert((isa<InsertElementInst>(LastInsertInst) ||
20896 isa<InsertValueInst>(LastInsertInst)) &&
20897"Expected insertelement or insertvalue instruction!");
20900"Expected empty result vectors!");
20905 BuildVectorOpds.
resize(*AggregateSize);
20906 InsertElts.
resize(*AggregateSize);
20912if (BuildVectorOpds.
size() >= 2)
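// Illustrative sketch only (hypothetical helper, not the pass's
// findBuildAggregate): walking a buildvector chain like the one shown in the
// comment above, from the last insertelement upwards, and recording each
// scalar at its constant lane index. Assumes the usual LLVM IR API.
[[maybe_unused]] static const auto CollectBuildVectorScalarsSketch =
    [](InsertElementInst *Last, SmallVectorImpl<Value *> &Scalars) {
  unsigned NumElts = cast<FixedVectorType>(Last->getType())->getNumElements();
  Scalars.assign(NumElts, nullptr);
  Value *Cur = Last;
  while (auto *IE = dyn_cast<InsertElementInst>(Cur)) {
    // Operand 1 is the inserted scalar, operand 2 the lane index.
    if (auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2)))
      if (Idx->getZExtValue() < NumElts && !Scalars[Idx->getZExtValue()])
        Scalars[Idx->getZExtValue()] = IE->getOperand(1);
    Cur = IE->getOperand(0); // continue with the partially built vector
  }
};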
20918/// Try and get a reduction instruction from a phi node. 20920/// Given a phi node \p P in a block \p ParentBB, consider possible reductions 20921/// if they come from either \p ParentBB or a containing loop latch. 20923/// \returns A candidate reduction value if possible, or \code nullptr \endcode 20924/// if not possible. 20927// There are situations where the reduction value is not dominated by the 20928// reduction phi. Vectorizing such cases has been reported to cause 20929// miscompiles. See PR25787. 20930auto DominatedReduxValue = [&](
Value *R) {
20931return isa<Instruction>(R) &&
20932 DT->
dominates(
P->getParent(), cast<Instruction>(R)->getParent());
20937// Return the incoming value if it comes from the same BB as the phi node. 20938if (
P->getIncomingBlock(0) == ParentBB) {
20939 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
20940 }
elseif (
P->getIncomingBlock(1) == ParentBB) {
20941 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
20944if (Rdx && DominatedReduxValue(Rdx))
20947// Otherwise, check whether we have a loop latch to look at. 20955// There is a loop latch, return the incoming value if it comes from 20956// that. This reduction pattern occasionally turns up. 20957if (
P->getIncomingBlock(0) == BBLatch) {
20958 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(0));
20959 }
elseif (
P->getIncomingBlock(1) == BBLatch) {
20960 Rdx = dyn_cast<Instruction>(
P->getIncomingValue(1));
20963if (Rdx && DominatedReduxValue(Rdx))
20991/// We could have an initial reduction that is not an add. 20992/// r *= v1 + v2 + v3 + v4 20993/// In such a case start looking for a tree rooted in the first '+'. 20994/// \Returns the new root if found, which may be nullptr if not an instruction. 20997assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
20998 isa<IntrinsicInst>(Root)) &&
20999"Expected binop, select, or intrinsic for reduction matching");
21001 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21003 Root->
getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21005return dyn_cast<Instruction>(
RHS);
21007return dyn_cast<Instruction>(
LHS);
21011/// \p Returns the first operand of \p I that does not match \p Phi. If 21012/// operand is not an instruction it returns nullptr. 21014Value *Op0 =
nullptr;
21015Value *Op1 =
nullptr;
21018return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21021/// \Returns true if \p I is a candidate instruction for reduction vectorization. 21024Value *B0 =
nullptr, *B1 =
nullptr;
21029bool SLPVectorizerPass::vectorizeHorReduction(
21034bool TryOperandsAsNewSeeds =
P && isa<BinaryOperator>(Root);
21036if (Root->
getParent() != BB || isa<PHINode>(Root))
21039// If we can find a secondary reduction root, use that instead. 21040auto SelectRoot = [&]() {
// Start analysis from the Root instruction. If a horizontal reduction is
// found, try to vectorize it. If it is not a horizontal reduction, or
// vectorization is not possible or not effective, and the currently analyzed
// instruction is a binary operation, try to vectorize the operands using
// pre-order DFS traversal order. If the operands were not vectorized, repeat
// the same procedure, considering each operand as a possible root of a
// horizontal reduction.
// Interrupt the process if the Root instruction itself was vectorized or all
// sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
// If a horizontal reduction was not matched or vectorized, we collect
// instructions for possible later vectorization attempts.
std::queue<std::pair<Instruction *, unsigned>>
Stack;
21060Stack.emplace(SelectRoot(), 0);
21064if (
R.isAnalyzedReductionRoot(Inst))
21069if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *
DL, *TLI))
21071return HorRdx.tryToReduce(R, *
DL,
TTI, *TLI, AC);
21073auto TryAppendToPostponedInsts = [&](
Instruction *FutureSeed) {
21074if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21079// Do not collect CmpInst or InsertElementInst/InsertValueInst as their 21080// analysis is done separately. 21081if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21086while (!
Stack.empty()) {
21089 std::tie(Inst, Level) =
Stack.front();
21091// Do not try to analyze instruction that has already been vectorized. 21092// This may happen when we vectorize instruction operands on a previous 21093// iteration while stack was populated before that happened. 21094if (
R.isDeleted(Inst))
21096if (
Value *VectorizedV = TryToReduce(Inst)) {
21098if (
auto *
I = dyn_cast<Instruction>(VectorizedV)) {
21099// Try to find another reduction. 21103if (
R.isDeleted(Inst))
21106// We could not vectorize `Inst` so try to use it as a future seed. 21107if (!TryAppendToPostponedInsts(Inst)) {
21113// Try to vectorize operands. 21114// Continue analysis for the instruction from the same basic block only to 21115// save compile time. 21118if (VisitedInstrs.
insert(
Op).second)
21119if (
auto *
I = dyn_cast<Instruction>(
Op))
21120// Do not try to vectorize CmpInst operands, this is done 21122if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(
I) &&
21123 !
R.isDeleted(
I) &&
I->getParent() == BB)
21132bool Res = vectorizeHorReduction(
P, Root, BB, R, PostponedInsts);
21133 Res |= tryToVectorize(PostponedInsts, R);
21140for (
Value *V : Insts)
21141if (
auto *Inst = dyn_cast<Instruction>(V); Inst && !
R.isDeleted(Inst))
21142 Res |= tryToVectorize(Inst, R);
21146bool SLPVectorizerPass::vectorizeInsertValueInst(
InsertValueInst *IVI,
21149if (!
R.canMapToVector(IVI->
getType()))
21157if (MaxVFOnly && BuildVectorOpds.
size() == 2) {
21158R.getORE()->emit([&]() {
21160 <<
"Cannot SLP vectorize list: only 2 elements of buildvalue, " 21161"trying reduction first.";
21165LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IVI <<
"\n");
21166// Aggregate value is unlikely to be processed in vector register. 21167return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21177 (
all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21181if (MaxVFOnly && BuildVectorInsts.
size() == 2) {
21182R.getORE()->emit([&]() {
21184 <<
"Cannot SLP vectorize list: only 2 elements of buildvector, " 21185"trying reduction first.";
21189LLVM_DEBUG(
dbgs() <<
"SLP: array mappable to vector: " << *IEI <<
"\n");
21190return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21193template <
typename T>
21199bool Changed =
false;
// Sort by type, parent, operands.
// Try to vectorize elements based on their type.
// Look for the next elements with the same type, parent and operand kinds.
auto *
I = dyn_cast<Instruction>(*IncIt);
21211if (!
I || R.isDeleted(
I)) {
21215auto *SameTypeIt = IncIt;
21216while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21217 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21218 AreCompatible(*SameTypeIt, *IncIt))) {
21219auto *
I = dyn_cast<Instruction>(*SameTypeIt);
21221if (
I && !R.isDeleted(
I))
21225// Try to vectorize them. 21226unsigned NumElts = VL.
size();
21227LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize starting at nodes (" 21228 << NumElts <<
")\n");
21229// The vectorization is a 3-state attempt: 21230// 1. Try to vectorize instructions with the same/alternate opcodes with the 21231// size of maximal register at first. 21232// 2. Try to vectorize remaining instructions with the same type, if 21233// possible. This may result in the better vectorization results rather than 21234// if we try just to vectorize instructions with the same/alternate opcodes. 21235// 3. Final attempt to try to vectorize all instructions with the 21236// same/alternate ops only, this may result in some extra final 21238if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL), MaxVFOnly)) {
// Success: start over because instructions might have been changed.
VL.
swap(Candidates);
21242 Candidates.
clear();
21244if (
auto *
I = dyn_cast<Instruction>(V);
I && !R.isDeleted(
I))
21248 /// \Returns the minimum number of elements that we will attempt to 21250auto GetMinNumElements = [&R](
Value *V) {
21251unsigned EltSize = R.getVectorElementSize(V);
21252return std::max(2U, R.getMaxVecRegSize() / EltSize);
21254if (NumElts < GetMinNumElements(*IncIt) &&
21255 (Candidates.
empty() ||
21256 Candidates.
front()->getType() == (*IncIt)->getType())) {
21258if (
auto *
I = dyn_cast<Instruction>(V);
I && !R.isDeleted(
I))
21263// Final attempt to vectorize instructions with the same types. 21264if (Candidates.
size() > 1 &&
21265 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21266if (TryToVectorizeHelper(Candidates,
/*MaxVFOnly=*/false)) {
// Success: start over because instructions might have been changed.
} else if (MaxVFOnly) {
21270// Try to vectorize using small vectors. 21272for (
auto *It = Candidates.
begin(), *
End = Candidates.
end(); It !=
End;
21274auto *
I = dyn_cast<Instruction>(*It);
21275if (!
I || R.isDeleted(
I)) {
21279auto *SameTypeIt = It;
21280while (SameTypeIt !=
End &&
21281 (!isa<Instruction>(*SameTypeIt) ||
21282 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21283 AreCompatible(*SameTypeIt, *It))) {
21284auto *
I = dyn_cast<Instruction>(*SameTypeIt);
21286if (
I && !R.isDeleted(
I))
21289unsigned NumElts = VL.
size();
21290if (NumElts > 1 && TryToVectorizeHelper(
ArrayRef(VL),
21291/*MaxVFOnly=*/false))
21296 Candidates.
clear();
21299// Start over at the next instruction of a different type (or the end). 21300 IncIt = SameTypeIt;
/// Compare two cmp instructions. If IsCompatibility is true, the function
/// returns true if the 2 cmps have the same/swapped predicates and most
/// compatible corresponding operands. If IsCompatibility is false, the
/// function implements a strict weak ordering relation between the two cmp
/// instructions, returning true if the first instruction is "less" than the
/// second, i.e. its predicate is less than the predicate of the second or
/// the operand IDs are less than the operand IDs of the second cmp
/// instruction.
template <
bool IsCompatibility>
21317"Expected valid element types only.");
21319return IsCompatibility;
21320auto *CI1 = cast<CmpInst>(V);
21321auto *CI2 = cast<CmpInst>(V2);
21322if (CI1->getOperand(0)->getType()->getTypeID() <
21324return !IsCompatibility;
21325if (CI1->getOperand(0)->getType()->getTypeID() >
21328if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21330return !IsCompatibility;
21331if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21340if (BasePred1 < BasePred2)
21341return !IsCompatibility;
21342if (BasePred1 > BasePred2)
21344// Compare operands. 21345bool CI1Preds = Pred1 == BasePred1;
21346bool CI2Preds = Pred2 == BasePred1;
21347for (
intI = 0, E = CI1->getNumOperands();
I < E; ++
I) {
21348auto *Op1 = CI1->getOperand(CI1Preds ?
I : E -
I - 1);
21353return !IsCompatibility;
21356if (
auto *I1 = dyn_cast<Instruction>(Op1))
21357if (
auto *I2 = dyn_cast<Instruction>(Op2)) {
21358if (IsCompatibility) {
21359if (I1->getParent() != I2->getParent())
21362// Try to compare nodes with same parent. 21366return NodeI2 !=
nullptr;
21369assert((NodeI1 == NodeI2) ==
21371"Different nodes should have different DFS numbers");
21372if (NodeI1 != NodeI2)
21376if (S && (IsCompatibility || !S.isAltShuffle()))
21378if (IsCompatibility)
21380if (I1->getOpcode() != I2->getOpcode())
21381return I1->getOpcode() < I2->getOpcode();
21384return IsCompatibility;
21387template <
typename ItT>
21390bool Changed =
false;
21391// Try to find reductions first. 21396if (
auto *RootOp = dyn_cast<Instruction>(
Op)) {
21397 Changed |= vectorizeRootInstruction(
nullptr, RootOp, BB, R);
21402// Try to vectorize operands as vector bundles. 21406 Changed |= tryToVectorize(
I, R);
21408// Try to vectorize list of compares. 21409// Sort by type, compare predicate, etc. 21413return compareCmp<false>(V, V2, *TLI, *DT);
21416auto AreCompatibleCompares = [&](
Value *V1,
Value *
V2) {
21419return compareCmp<true>(V1, V2, *TLI, *DT);
21426if (Vals.
size() <= 1)
21428 Changed |= tryToVectorizeSequence<Value>(
21429 Vals, CompareSorter, AreCompatibleCompares,
21431// Exclude possible reductions from other blocks. 21432bool ArePossiblyReducedInOtherBlock =
any_of(Candidates, [](
Value *V) {
21434 auto *Select = dyn_cast<SelectInst>(U);
21436 Select->getParent() != cast<Instruction>(V)->getParent();
21439if (ArePossiblyReducedInOtherBlock)
21441return tryToVectorizeList(Candidates, R, MaxVFOnly);
21443/*MaxVFOnly=*/true,
R);
21447bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21449assert(
all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21450"This function only accepts Insert instructions");
21451bool OpsChanged =
false;
21453for (
auto *
I :
reverse(Instructions)) {
21454// pass1 - try to match and vectorize a buildvector sequence for MaxVF only. 21455if (
R.isDeleted(
I) || isa<CmpInst>(
I))
21457if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
21459 vectorizeInsertValueInst(LastInsertValue, BB, R,
/*MaxVFOnly=*/true);
21460 }
elseif (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
21462 vectorizeInsertElementInst(LastInsertElem, BB, R,
/*MaxVFOnly=*/true);
21464// pass2 - try to vectorize reductions only 21467 OpsChanged |= vectorizeHorReduction(
nullptr,
I, BB, R, PostponedInsts);
21468if (
R.isDeleted(
I) || isa<CmpInst>(
I))
21470// pass3 - try to match and vectorize a buildvector sequence. 21471if (
auto *LastInsertValue = dyn_cast<InsertValueInst>(
I)) {
21473 vectorizeInsertValueInst(LastInsertValue, BB, R,
/*MaxVFOnly=*/false);
21474 }
elseif (
auto *LastInsertElem = dyn_cast<InsertElementInst>(
I)) {
21475 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21476/*MaxVFOnly=*/false);
21479// Now try to vectorize postponed instructions. 21480 OpsChanged |= tryToVectorize(PostponedInsts, R);
21487bool Changed =
false;
// Maps phi nodes to the non-phi nodes found in the use tree for each phi
// node. Allows us to better identify the chains that can be vectorized in the
auto PHICompare = [
this, &PHIToOpcodes](
Value *V1,
Value *
V2) {
21497"Expected vectorizable types only.");
// It is fine to compare type IDs here, since we expect only vectorizable
// types, like ints, floats and pointers; we don't care about other types.
V2->getType()->getScalarSizeInBits())
21508V2->getType()->getScalarSizeInBits())
21512if (Opcodes1.
size() < Opcodes2.
size())
21514if (Opcodes1.
size() > Opcodes2.
size())
21516for (
intI = 0, E = Opcodes1.
size();
I < E; ++
I) {
21518// Instructions come first. 21519auto *
I1 = dyn_cast<Instruction>(Opcodes1[
I]);
21520auto *I2 = dyn_cast<Instruction>(Opcodes2[
I]);
21525return NodeI2 !=
nullptr;
21528assert((NodeI1 == NodeI2) ==
21530"Different nodes should have different DFS numbers");
21531if (NodeI1 != NodeI2)
21534if (S && !S.isAltShuffle())
21536returnI1->getOpcode() < I2->getOpcode();
21544// Non-undef constants come next. 21545bool C1 = isa<Constant>(Opcodes1[
I]) && !isa<UndefValue>(Opcodes1[
I]);
21546bool C2 = isa<Constant>(Opcodes2[
I]) && !isa<UndefValue>(Opcodes2[
I]);
21554bool U1 = isa<UndefValue>(Opcodes1[
I]);
21555bool U2 = isa<UndefValue>(Opcodes2[
I]);
21557// Non-constant non-instructions come next. 21559auto ValID1 = Opcodes1[
I]->getValueID();
21560auto ValID2 = Opcodes2[
I]->getValueID();
21561if (ValID1 == ValID2)
21563if (ValID1 < ValID2)
21565if (ValID1 > ValID2)
21573// Undefs come last. 21574assert(U1 && U2 &&
"The only thing left should be undef & undef.");
21578auto AreCompatiblePHIs = [&PHIToOpcodes,
this, &
R](
Value *V1,
Value *
V2) {
21581if (V1->getType() !=
V2->getType())
21585if (Opcodes1.
size() != Opcodes2.
size())
21587for (
intI = 0, E = Opcodes1.
size();
I < E; ++
I) {
21588// Undefs are compatible with any other value. 21589if (isa<UndefValue>(Opcodes1[
I]) || isa<UndefValue>(Opcodes2[
I]))
21591if (
auto *I1 = dyn_cast<Instruction>(Opcodes1[
I]))
21592if (
auto *I2 = dyn_cast<Instruction>(Opcodes2[
I])) {
21593if (
R.isDeleted(I1) ||
R.isDeleted(I2))
21595if (
I1->getParent() != I2->getParent())
21601if (isa<Constant>(Opcodes1[
I]) && isa<Constant>(Opcodes2[
I]))
21603if (Opcodes1[
I]->getValueID() != Opcodes2[
I]->getValueID())
21609bool HaveVectorizedPhiNodes =
false;
21611// Collect the incoming values from the PHIs. 21614auto *
P = dyn_cast<PHINode>(&
I);
21618// No need to analyze deleted, vectorized and non-vectorizable 21620if (!VisitedInstrs.
count(
P) && !
R.isDeleted(
P) &&
21628// Find the corresponding non-phi nodes for better matching when trying to 21633if (!Opcodes.
empty())
21637while (!Nodes.
empty()) {
21638auto *
PHI = cast<PHINode>(Nodes.pop_back_val());
21641for (
Value *V :
PHI->incoming_values()) {
21642if (
auto *PHI1 = dyn_cast<PHINode>((V))) {
21643 Nodes.push_back(PHI1);
21651 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21652Incoming, PHICompare, AreCompatiblePHIs,
21654return tryToVectorizeList(Candidates, R, MaxVFOnly);
21656/*MaxVFOnly=*/true,
R);
21657 Changed |= HaveVectorizedPhiNodes;
21658if (HaveVectorizedPhiNodes &&
any_of(PHIToOpcodes, [&](
constauto &
P) {
21659auto *
PHI = dyn_cast<PHINode>(
P.first);
21660return !
PHI ||
R.isDeleted(
PHI);
21662 PHIToOpcodes.
clear();
21664 }
while (HaveVectorizedPhiNodes);
21666 VisitedInstrs.
clear();
21668 InstSetVector PostProcessInserts;
// Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is true,
// also vectorizes `PostProcessCmps`.
auto VectorizeInsertsAndCmps = [&](
bool VectorizeCmps) {
21673bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21674if (VectorizeCmps) {
21675 Changed |= vectorizeCmpInsts(
reverse(PostProcessCmps), BB, R);
21676 PostProcessCmps.
clear();
21678 PostProcessInserts.clear();
21681// Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`. 21683if (
auto *Cmp = dyn_cast<CmpInst>(
I))
21684return PostProcessCmps.
contains(Cmp);
21685return isa<InsertElementInst, InsertValueInst>(
I) &&
21686 PostProcessInserts.contains(
I);
// Returns true if `I` is an instruction without users, like a terminator, a
// store, or a function call with an ignored return value. Unused instructions
// are ignored based on instruction type, except for CallInst and InvokeInst.
return I->use_empty() &&
21693 (
I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(
I));
21696// Skip instructions with scalable type. The num of elements is unknown at 21697// compile-time for scalable type. 21698if (isa<ScalableVectorType>(It->getType()))
21701// Skip instructions marked for the deletion. 21702if (
R.isDeleted(&*It))
21704// We may go through BB multiple times so skip the one we have checked. 21705if (!VisitedInstrs.
insert(&*It).second) {
21706if (HasNoUsers(&*It) &&
21707 VectorizeInsertsAndCmps(
/*VectorizeCmps=*/It->isTerminator())) {
21708// We would like to start over since some instructions are deleted 21709// and the iterator may become invalid value. 21717if (isa<DbgInfoIntrinsic>(It))
21720// Try to vectorize reductions that use PHINodes. 21721if (
PHINode *
P = dyn_cast<PHINode>(It)) {
21722// Check that the PHI is a reduction PHI. 21723if (
P->getNumIncomingValues() == 2) {
21724// Try to match and vectorize a horizontal reduction. 21726if (Root && vectorizeRootInstruction(
P, Root, BB, R)) {
21733// Try to vectorize the incoming values of the PHI, to catch reductions 21734// that feed into PHIs. 21735for (
unsignedI : seq<unsigned>(
P->getNumIncomingValues())) {
21736// Skip if the incoming block is the current BB for now. Also, bypass 21737// unreachable IR for efficiency and to avoid crashing. 21738// TODO: Collect the skipped incoming values and try to vectorize them 21739// after processing BB. 21740if (BB ==
P->getIncomingBlock(
I) ||
21744// Postponed instructions should not be vectorized here, delay their 21746if (
auto *PI = dyn_cast<Instruction>(
P->getIncomingValue(
I));
21747 PI && !IsInPostProcessInstrs(PI)) {
21749 vectorizeRootInstruction(
nullptr, PI,
P->getIncomingBlock(
I), R);
21751if (Res &&
R.isDeleted(
P)) {
21761if (HasNoUsers(&*It)) {
21762bool OpsChanged =
false;
21763auto *
SI = dyn_cast<StoreInst>(It);
// Try to vectorize the chain in the store, if this is the only store to the
// address in the block.
// TODO: This is just a temporary solution to save compile time. Need
// to investigate if we can safely turn on slp-vectorize-hor-store
// instead, to allow lookup for reduction chains in all non-vectorized
// stores (need to check side effects and compile time).
TryToVectorizeRoot |= (
I == Stores.
end() ||
I->second.size() == 1) &&
21774SI->getValueOperand()->hasOneUse();
21776if (TryToVectorizeRoot) {
21777for (
auto *V : It->operand_values()) {
21778// Postponed instructions should not be vectorized here, delay their 21780if (
auto *VI = dyn_cast<Instruction>(V);
21781VI && !IsInPostProcessInstrs(VI))
21782// Try to match and vectorize a horizontal reduction. 21783 OpsChanged |= vectorizeRootInstruction(
nullptr, VI, BB, R);
21786// Start vectorization of post-process list of instructions from the 21787// top-tree instructions to try to vectorize as many instructions as 21790 VectorizeInsertsAndCmps(
/*VectorizeCmps=*/It->isTerminator());
21792// We would like to start over since some instructions are deleted 21793// and the iterator may become invalid value. 21801if (isa<InsertElementInst, InsertValueInst>(It))
21802 PostProcessInserts.insert(&*It);
21803elseif (isa<CmpInst>(It))
21804 PostProcessCmps.
insert(cast<CmpInst>(&*It));
21811auto Changed =
false;
21812for (
auto &Entry : GEPs) {
21813// If the getelementptr list has fewer than two elements, there's nothing 21815if (
Entry.second.size() < 2)
21818LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing a getelementptr list of length " 21819 <<
Entry.second.size() <<
".\n");
21821// Process the GEP list in chunks suitable for the target's supported 21822// vector size. If a vector register can't hold 1 element, we are done. We 21823// are trying to vectorize the index computations, so the maximum number of 21824// elements is based on the size of the index expression, rather than the 21825// size of the GEP itself (the target's pointer size). 21827 return !R.isDeleted(GEP);
21829if (It ==
Entry.second.end())
21831unsigned MaxVecRegSize =
R.getMaxVecRegSize();
21832unsigned EltSize =
R.getVectorElementSize(*(*It)->idx_begin());
21833if (MaxVecRegSize < EltSize)
21836unsigned MaxElts = MaxVecRegSize / EltSize;
21837for (
unsigned BI = 0, BE =
Entry.second.size(); BI < BE; BI += MaxElts) {
21838autoLen = std::min<unsigned>(BE - BI, MaxElts);
// Initialize a set of candidate getelementptrs. Note that we use a
// SetVector here to preserve program order. If the index computations
// are vectorizable and begin with loads, we want to minimize the chance
// of having to reorder them later.
// Some of the candidates may have already been vectorized after we
// initially collected them, or their index was optimized to a constant value.
// If so, they are marked as deleted, so remove them from the set of
// candidates.
Candidates.remove_if([&R](
Value *
I) {
21852returnR.isDeleted(cast<Instruction>(
I)) ||
21853 isa<Constant>(cast<GetElementPtrInst>(
I)->idx_begin()->
get());
21856// Remove from the set of candidates all pairs of getelementptrs with 21857// constant differences. Such getelementptrs are likely not good 21858// candidates for vectorization in a bottom-up phase since one can be 21859// computed from the other. We also ensure all candidate getelementptr 21860// indices are unique. 21861for (
intI = 0, E = GEPList.size();
I < E && Candidates.
size() > 1; ++
I) {
21862auto *GEPI = GEPList[
I];
21863if (!Candidates.count(GEPI))
21866for (
int J =
I + 1; J < E && Candidates.
size() > 1; ++J) {
21867auto *GEPJ = GEPList[J];
21869if (isa<SCEVConstant>(SE->
getMinusSCEV(SCEVI, SCEVJ))) {
21870 Candidates.remove(GEPI);
21871 Candidates.remove(GEPJ);
21872 }
elseif (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21873 Candidates.remove(GEPJ);
21878// We break out of the above computation as soon as we know there are 21879// fewer than two candidates remaining. 21880if (Candidates.
size() < 2)
21883// Add the single, non-constant index of each candidate to the bundle. We 21884// ensured the indices met these constraints when we originally collected 21885// the getelementptrs. 21887auto BundleIndex = 0
u;
21888for (
auto *V : Candidates) {
21889auto *
GEP = cast<GetElementPtrInst>(V);
21890auto *GEPIdx =
GEP->idx_begin()->get();
21891assert(
GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21892 Bundle[BundleIndex++] = GEPIdx;
21895// Try and vectorize the indices. We are currently only interested in 21896// gather-like cases of the form: 21898// ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ... 21900// where the loads of "a", the loads of "b", and the subtractions can be 21901// performed in parallel. It's likely that detecting this pattern in a 21902// bottom-up phase will be simpler and less costly than building a 21903// full-blown top-down phase beginning at the consecutive loads. 21904 Changed |= tryToVectorizeList(Bundle, R);
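// Illustrative sketch only (hypothetical source pattern, not part of the
// pass): the gather-like shape described above. The a[] and b[] loads and
// the subtractions forming the indices are what the pass tries to vectorize,
// while the g[] accesses themselves remain scalar gathers.
[[maybe_unused]] const auto GatherLikePatternSketch =
    [](const int *g, const int *a, const int *b, unsigned N) {
  int Sum = 0;
  for (unsigned I = 0; I < N; ++I)
    Sum += g[a[I] - b[I]]; // index computation a[I] - b[I] is vectorizable
  return Sum;
};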
21910bool SLPVectorizerPass::vectorizeStoreChains(
BoUpSLP &R) {
21911bool Changed =
false;
21912// Sort by type, base pointers and values operand. Value operands must be 21913// compatible (have the same opcode, same parent), otherwise it is 21914// definitely not profitable to try to vectorize them. 21916if (
V->getValueOperand()->getType()->getTypeID() <
21917V2->getValueOperand()->getType()->getTypeID())
21919if (
V->getValueOperand()->getType()->getTypeID() >
21920V2->getValueOperand()->getType()->getTypeID())
21922if (
V->getPointerOperandType()->getTypeID() <
21923V2->getPointerOperandType()->getTypeID())
21925if (
V->getPointerOperandType()->getTypeID() >
21926V2->getPointerOperandType()->getTypeID())
21928if (
V->getValueOperand()->getType()->getScalarSizeInBits() <
21929V2->getValueOperand()->getType()->getScalarSizeInBits())
21931if (
V->getValueOperand()->getType()->getScalarSizeInBits() >
21932V2->getValueOperand()->getType()->getScalarSizeInBits())
21934// UndefValues are compatible with all other values. 21935if (
auto *I1 = dyn_cast<Instruction>(
V->getValueOperand()))
21936if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
21940 DT->
getNode(I2->getParent());
21941assert(NodeI1 &&
"Should only process reachable instructions");
21942assert(NodeI2 &&
"Should only process reachable instructions");
21943assert((NodeI1 == NodeI2) ==
21945"Different nodes should have different DFS numbers");
21946if (NodeI1 != NodeI2)
21948returnI1->getOpcode() < I2->getOpcode();
21950returnV->getValueOperand()->getValueID() <
21951V2->getValueOperand()->getValueID();
21961// Undefs are compatible with any other value. 21963 isa<UndefValue>(
V2->getValueOperand()))
21966if (
auto *I2 = dyn_cast<Instruction>(
V2->getValueOperand())) {
21967if (
I1->getParent() != I2->getParent())
21972 isa<Constant>(
V2->getValueOperand()))
21975V2->getValueOperand()->getValueID();
21978// Attempt to sort and vectorize each of the store-groups. 21980for (
auto &Pair : Stores) {
21981if (Pair.second.size() < 2)
21985 << Pair.second.size() <<
".\n");
// Reverse the stores to do bottom-to-top analysis. This is important if the
// values are stored to the same addresses several times; in this case we need
// to follow the store order (reversed to respect the memory dependencies).
Pair.second.rend());
21995 Changed |= tryToVectorizeSequence<StoreInst>(
21996 ReversedStores, StoreSorter, AreCompatibleStores,
21998return vectorizeStores(Candidates, R, Attempted);
22000/*MaxVFOnly=*/false,
R);
static bool isConstant(const MachineInstr &MI)
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on sets of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
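The APInt entries above are the lane-mask primitives behind the DemandedElts arguments used by the cost queries in this file. A minimal, illustrative sketch of how they compose; standalone example code, not taken from this pass:

#include "llvm/ADT/APInt.h"
#include <cassert>

using namespace llvm;

// Track which of 8 vector lanes are demanded, in the spirit of the
// DemandedElts masks passed to scalarization-overhead queries.
static void demandedLanesDemo() {
  APInt Demanded = APInt::getZero(8);             // no lane demanded yet
  Demanded.setBits(2, 6);                         // demand lanes 2..5
  assert(Demanded[3] && !Demanded[6]);
  Demanded.setBit(6);                             // lane 6 now demanded too
  Demanded.clearBit(2);                           // lane 2 dropped again
  assert(!Demanded.isAllOnes() && !Demanded.isZero());
  APInt UpperHalf = APInt::getBitsSetFrom(8, 4);  // lanes 4..7
  APInt LaneZero = APInt::getOneBitSet(8, 0);     // only lane 0
  assert(APInt::getAllOnes(8).isAllOnes());
  (void)UpperHalf;
  (void)LaneZero;
}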
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
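ArrayRef slicing is how bundles of scalars (the VL lists above) get split and re-split without copying. An illustrative standalone sketch, not code from this pass:

#include "llvm/ADT/ArrayRef.h"
#include <cassert>

using namespace llvm;

// Split an 8-element bundle the way value lists are sliced during tree
// building: no copies, just views into the same storage.
static void bundleSlicingDemo() {
  int Scalars[] = {1, 2, 3, 4, 5, 6, 7, 8};
  ArrayRef<int> VL(Scalars);
  assert(VL.size() == 8 && !VL.empty());
  ArrayRef<int> FirstHalf = VL.take_front(4);   // {1, 2, 3, 4}
  ArrayRef<int> Tail = VL.drop_front(6);        // {7, 8}
  ArrayRef<int> Middle = VL.slice(2, 3);        // {3, 4, 5}
  assert(FirstHalf.front() == 1 && Tail.back() == 8 && Middle.size() == 3);
  assert(VL.drop_back(4).equals(FirstHalf));
}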
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Predicate getPredicate() const
Return the predicate for this instruction.
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
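When compare bundles are vectorized, operands are often swapped so that all lanes share one predicate; the predicate helpers above are what make that legal. A hedged sketch of the idea (not the pass's actual canonicalization code):

#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Map "greater-than" style integer predicates to their "less-than"
// counterparts, which is what swapping the two compare operands achieves.
static CmpInst::Predicate canonicalizeToLessThan(CmpInst::Predicate P) {
  switch (P) {
  case CmpInst::ICMP_SGT:
  case CmpInst::ICMP_SGE:
  case CmpInst::ICMP_UGT:
  case CmpInst::ICMP_UGE:
    return CmpInst::getSwappedPredicate(P); // SGT->SLT, UGE->ULE, ...
  default:
    return P;
  }
}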
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
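These DenseMap operations are the bread-and-butter caching idiom throughout the pass (scalar-to-tree-entry maps, alias caches and so on). A small standalone illustration:

#include "llvm/ADT/DenseMap.h"
#include <cassert>

using namespace llvm;

// Cache a per-key index with the usual insert/lookup/erase idioms.
static void denseMapDemo() {
  DenseMap<int, unsigned> KeyToIndex;
  KeyToIndex.try_emplace(42, 0u);              // inserted only if missing
  KeyToIndex.insert({7, 1u});
  assert(KeyToIndex.count(42) == 1);
  assert(KeyToIndex.lookup(7) == 1u);
  assert(KeyToIndex.lookup(100) == 0u);        // default value when absent
  if (auto It = KeyToIndex.find(7); It != KeyToIndex.end())
    It->second = 2u;                           // in-place update
  KeyToIndex.erase(42);
  assert(!KeyToIndex.contains(42));
}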
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
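The IRBuilder entry points above are what the shuffle builders in this file ultimately emit through. A hedged, standalone sketch of the typical gather-then-shuffle sequence (illustrative only; the real ShuffleInstructionBuilder handles masks, poison lanes and value reuse far more carefully):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Build a <4 x i32> from two scalars and broadcast lane 0 across all lanes.
static Value *buildAndSplat(IRBuilderBase &Builder, Value *A, Value *B) {
  auto *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
  Value *Vec = PoisonValue::get(VecTy);
  Vec = Builder.CreateInsertElement(Vec, A, Builder.getInt32(0), "vec.a");
  Vec = Builder.CreateInsertElement(Vec, B, Builder.getInt32(1), "vec.b");
  int SplatMask[] = {0, 0, 0, 0};
  return Builder.CreateShuffleVector(Vec, PoisonValue::get(VecTy), SplatMask,
                                     "splat");
}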
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
const SmallVectorImpl< Type * > & getArgTypes() const
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
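These static mask predicates are how the pass classifies the shuffles it is about to emit or cost. A small standalone illustration, assuming the signatures listed above:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include <cassert>

using namespace llvm;

// Classify a few hand-written constant masks over a 4-wide source vector.
static void classifyMasksDemo() {
  SmallVector<int> Identity = {0, 1, 2, 3};
  SmallVector<int> Reverse = {3, 2, 1, 0};
  SmallVector<int> Extract = {2, 3};

  assert(ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4));
  assert(ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4));

  int Index = -1;
  // Picking lanes 2..3 out of a 4-wide source is an extract-subvector mask.
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4,
                                                Index);
  assert(IsExtract && Index == 2);
}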
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
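UseMask-style bit vectors (see isUndefVector above) are built and walked with exactly these operations. A standalone illustration:

#include "llvm/ADT/SmallBitVector.h"
#include <cassert>

using namespace llvm;

// Mark two "used" lanes out of eight and walk the set bits.
static void useMaskDemo() {
  SmallBitVector UseMask(8, false);
  UseMask.set(1);
  UseMask.set(4);
  assert(UseMask.any() && !UseMask.all() && UseMask.count() == 2);
  for (int Bit = UseMask.find_first(); Bit != -1;
       Bit = UseMask.find_next(Bit))
    assert(UseMask.test(Bit));                 // visits bit 1, then bit 4
}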
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
@ OK_UniformConstantValue
@ OK_NonUniformConstantValue
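All of the TargetTransformInfo hooks above feed one question: is the vector form cheaper than the sum of its scalars? A deliberately simplified, hedged sketch of that comparison, using only the entries listed here (the real getTreeCost also accounts for gathers, extracts, operand info and minimum bit widths):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Compare VF scalar adds against one vector add plus a single-source shuffle.
static bool vectorAddLooksProfitable(const TargetTransformInfo &TTI,
                                     Type *ScalarTy, unsigned VF) {
  auto *VecTy = FixedVectorType::get(ScalarTy, VF);
  TargetTransformInfo::TargetCostKind Kind =
      TargetTransformInfo::TCK_RecipThroughput;

  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::Add, ScalarTy, Kind) * VF;
  InstructionCost VectorCost =
      TTI.getArithmeticInstrCost(Instruction::Add, VecTy, Kind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy,
                         /*Mask=*/{}, Kind);

  return VectorCost < ScalarCost;
}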
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
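The Value use-list queries above (hasOneUse, users, hasNUses) drive most of the "is this scalar still needed after vectorization?" decisions. A small hedged sketch of that kind of check; an illustrative helper, not part of the pass:

#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// True if the instruction has exactly one use and that use lives in a
// different basic block, i.e. the value escapes its defining block once.
static bool hasSingleExternalUse(const Instruction *I) {
  if (!I->hasOneUse())
    return false;
  const User *U = *I->user_begin();
  if (const auto *UI = dyn_cast<Instruction>(U))
    return UI->getParent() != I->getParent();
  return false;
}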
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g., add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a strided/scatter load, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns reduction type after minbitdth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
Function * getVectorizedFunction(const VFShape &Shape) const
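Taken together, the BoUpSLP members above are driven in a fairly fixed order by the rest of this file. A schematic sketch of that sequence as it could appear inside this file; the exact ordering and the Threshold parameter are illustrative, not verbatim pass code (a negative tree cost means the vector form is expected to be cheaper):

// Schematic driver for the BoUpSLP interface; "Threshold" is illustrative.
static bool tryVectorizeBundleSketch(BoUpSLP &R, ArrayRef<Value *> VL,
                                     InstructionCost Threshold) {
  SmallDenseSet<Value *> UserIgnoreList;
  R.buildTree(VL, UserIgnoreList);          // build the vectorizable tree
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();                   // pick profitable lane orders
  R.reorderBottomToTop();
  R.transformNodes();                       // target-specific node rewrites
  R.buildExternalUses();                    // record out-of-tree users
  R.computeMinimumValueSizes();             // min-bitwidth analysis
  InstructionCost Cost = R.getTreeCost();
  if (Cost >= Threshold)                    // not profitable enough
    return false;
  R.vectorizeTree();                        // emit vector code and extracts
  return true;
}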
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
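The PatternMatch combinators above are used throughout this file to recognize reduction and store roots. A tiny standalone example of the style; the specific pattern here is only for illustration:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Recognize "add (load %p), %x" in either operand order, capturing the
// load's pointer operand and the other addend.
static bool matchAddOfLoad(Value *V, Value *&Ptr, Value *&Other) {
  return match(V, m_Add(m_OneUse(m_Load(m_Value(Ptr))), m_Value(Other))) ||
         match(V, m_Add(m_Value(Other), m_OneUse(m_Load(m_Value(Ptr)))));
}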
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
testing::Matcher< const detail::ErrorHolder & > Failed()
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
static unsigned getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, const unsigned Limit=std::numeric_limits< unsigned >::max())
Returns number of parts, the type VecTy will be split at the codegen phase.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instructions I depend values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane maks phi as machine instruction, incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Function object to check whether the second component of a container supported by std::get (like std:...
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.